/work/svt-av1/Source/Lib/Codec/transforms.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright(c) 2019 Intel Corporation |
3 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
4 | | * |
5 | | * This source code is subject to the terms of the BSD 2 Clause License and |
6 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
7 | | * was not distributed with this source code in the LICENSE file, you can |
8 | | * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open |
9 | | * Media Patent License 1.0 was not distributed with this source code in the |
10 | | * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license. |
11 | | */ |
12 | | |
13 | | #include <stdlib.h> |
14 | | #include "transforms.h" |
15 | | #include "aom_dsp_rtcd.h" |
16 | | |
17 | | const int8_t fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/][MAX_TXWH_IDX /*txh_idx*/] = { |
18 | | {13, 13, 13, 0, 0}, {13, 13, 13, 12, 0}, {13, 13, 13, 12, 13}, {0, 13, 13, 12, 13}, {0, 0, 13, 12, 13}}; |
19 | | const int8_t fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/][MAX_TXWH_IDX /*txh_idx*/] = { |
20 | | {13, 13, 12, 0, 0}, {13, 13, 13, 12, 0}, {13, 13, 12, 13, 12}, {0, 12, 13, 12, 11}, {0, 0, 12, 11, 10}}; |
21 | | |
22 | | const uint8_t tx_blocks_per_depth[BLOCK_SIZES_ALL][MAX_VARTX_DEPTH + 1] = { |
23 | | {1, 1, 1}, // BLOCK_4X4 |
24 | | {1, 1, 1}, // BLOCK_4X8 |
25 | | {1, 1, 1}, // BLOCK_8X4 |
26 | | {1, 4, 4}, // BLOCK_8X8 |
27 | | {1, 2, 8}, // BLOCK_8X16 |
28 | | {1, 2, 8}, // BLOCK_16X8 |
29 | | {1, 4, 16}, // BLOCK_16X16 |
30 | | {1, 2, 8}, // BLOCK_16X32 |
31 | | {1, 2, 8}, // BLOCK_32X16 |
32 | | {1, 4, 16}, // BLOCK_32X32 |
33 | | {1, 2, 8}, // BLOCK_32X64 |
34 | | {1, 2, 8}, // BLOCK_64X32 |
35 | | {1, 4, 16}, // BLOCK_64X64 |
36 | | {2, 2, 2}, // BLOCK_64X128 |
37 | | {2, 2, 2}, // BLOCK_128X64 |
38 | | {4, 4, 4}, // BLOCK_128X128 |
39 | | {1, 2, 4}, // BLOCK_4X16 |
40 | | {1, 2, 4}, // BLOCK_16X4 |
41 | | {1, 2, 4}, // BLOCK_8X32 |
42 | | {1, 2, 4}, // BLOCK_32X8 |
43 | | {1, 2, 4}, // BLOCK_16X64 |
44 | | {1, 2, 4} // BLOCK_64X16 |
45 | | }; |
46 | | |
47 | | // origin is block - separate tables for INTRA (idx 0) and INTER (idx 1) needed b/c of tx depth 2 |
48 | | const Position tx_org[BLOCK_SIZES_ALL][2 /*is_inter*/][MAX_VARTX_DEPTH + 1][MAX_TXB_COUNT] = { |
49 | | {// BLOCK_4X4 |
50 | | {// intra |
51 | | {// tx_depth 0 |
52 | | {0, 0}}, |
53 | | {// tx_depth 1 |
54 | | {0, 0}}, |
55 | | {// tx_depth 2 |
56 | | {0, 0}}}, |
57 | | {// inter |
58 | | {// tx_depth 0 |
59 | | {0, 0}}, |
60 | | {// tx_depth 1 |
61 | | {0, 0}}, |
62 | | {// tx_depth 2 |
63 | | {0, 0}}}}, |
64 | | {// BLOCK_4X8 |
65 | | {// intra |
66 | | {// tx_depth 0 |
67 | | {0, 0}}, |
68 | | {// tx_depth 1 |
69 | | {0, 0}}, |
70 | | {// tx_depth 2 |
71 | | {0, 0}}}, |
72 | | {// inter |
73 | | {// tx_depth 0 |
74 | | {0, 0}}, |
75 | | {// tx_depth 1 |
76 | | {0, 0}}, |
77 | | {// tx_depth 2 |
78 | | {0, 0}}}}, |
79 | | {// BLOCK_8X4 |
80 | | {// intra |
81 | | {// tx_depth 0 |
82 | | {0, 0}}, |
83 | | {// tx_depth 1 |
84 | | {0, 0}}, |
85 | | {// tx_depth 2 |
86 | | {0, 0}}}, |
87 | | {// inter |
88 | | {// tx_depth 0 |
89 | | {0, 0}}, |
90 | | {// tx_depth 1 |
91 | | {0, 0}}, |
92 | | {// tx_depth 2 |
93 | | {0, 0}}}}, |
94 | | {// BLOCK_8X8 |
95 | | {// intra |
96 | | {// tx_depth 0 |
97 | | {0, 0}}, |
98 | | {// tx_depth 1 |
99 | | {0, 0}, |
100 | | {4, 0}, |
101 | | {0, 4}, |
102 | | {4, 4}}, |
103 | | { |
104 | | // tx_depth 2 |
105 | | {0, 0} // not allowed |
106 | | }}, |
107 | | {// inter |
108 | | {// tx_depth 0 |
109 | | {0, 0}}, |
110 | | {// tx_depth 1 |
111 | | {0, 0}, |
112 | | {4, 0}, |
113 | | {0, 4}, |
114 | | {4, 4}}, |
115 | | { |
116 | | // tx_depth 2 |
117 | | {0, 0} // not allowed |
118 | | }}}, |
119 | | {// BLOCK_8X16 |
120 | | {// intra |
121 | | {// tx_depth 0 |
122 | | {0, 0}}, |
123 | | {// tx_depth 1 |
124 | | {0, 0}, |
125 | | {0, 8}}, |
126 | | {// tx_depth 2 |
127 | | {0, 0}, |
128 | | {4, 0}, |
129 | | {0, 4}, |
130 | | {4, 4}, |
131 | | {0, 8}, |
132 | | {4, 8}, |
133 | | {0, 12}, |
134 | | {4, 12}}}, |
135 | | {// inter |
136 | | {// tx_depth 0 |
137 | | {0, 0}}, |
138 | | {// tx_depth 1 |
139 | | {0, 0}, |
140 | | {0, 8}}, |
141 | | {// tx_depth 2 |
142 | | {0, 0}, |
143 | | {4, 0}, |
144 | | {0, 4}, |
145 | | {4, 4}, |
146 | | {0, 8}, |
147 | | {4, 8}, |
148 | | {0, 12}, |
149 | | {4, 12}}}}, |
150 | | {// BLOCK_16X8 |
151 | | {// intra |
152 | | {// tx_depth 0 |
153 | | {0, 0}}, |
154 | | {// tx_depth 1 |
155 | | {0, 0}, |
156 | | {8, 0}}, |
157 | | {// tx_depth 2 |
158 | | {0, 0}, |
159 | | {4, 0}, |
160 | | {8, 0}, |
161 | | {12, 0}, |
162 | | {0, 4}, |
163 | | {4, 4}, |
164 | | {8, 4}, |
165 | | {12, 4}}}, |
166 | | {// inter |
167 | | {// tx_depth 0 |
168 | | {0, 0}}, |
169 | | {// tx_depth 1 |
170 | | {0, 0}, |
171 | | {8, 0}}, |
172 | | {// tx_depth 2 |
173 | | {0, 0}, |
174 | | {4, 0}, |
175 | | {0, 4}, |
176 | | {4, 4}, |
177 | | {8, 0}, |
178 | | {12, 0}, |
179 | | {8, 4}, |
180 | | {12, 4}}}}, |
181 | | {// BLOCK_16X16 |
182 | | {// intra |
183 | | {// tx_depth 0 |
184 | | {0, 0}}, |
185 | | {// tx_depth 1 |
186 | | {0, 0}, |
187 | | {8, 0}, |
188 | | {0, 8}, |
189 | | {8, 8}}, |
190 | | {// tx_depth 2 |
191 | | {0, 0}, |
192 | | {4, 0}, |
193 | | {8, 0}, |
194 | | {12, 0}, |
195 | | {0, 4}, |
196 | | {4, 4}, |
197 | | {8, 4}, |
198 | | {12, 4}, |
199 | | {0, 8}, |
200 | | {4, 8}, |
201 | | {8, 8}, |
202 | | {12, 8}, |
203 | | {0, 12}, |
204 | | {4, 12}, |
205 | | {8, 12}, |
206 | | {12, 12}}}, |
207 | | {// inter |
208 | | {// tx_depth 0 |
209 | | {0, 0}}, |
210 | | {// tx_depth 1 |
211 | | {0, 0}, |
212 | | {8, 0}, |
213 | | {0, 8}, |
214 | | {8, 8}}, |
215 | | {// tx_depth 2 |
216 | | {0, 0}, |
217 | | {4, 0}, |
218 | | {0, 4}, |
219 | | {4, 4}, |
220 | | {8, 0}, |
221 | | {12, 0}, |
222 | | {8, 4}, |
223 | | {12, 4}, |
224 | | {0, 8}, |
225 | | {4, 8}, |
226 | | {0, 12}, |
227 | | {4, 12}, |
228 | | {8, 8}, |
229 | | {12, 8}, |
230 | | {8, 12}, |
231 | | {12, 12}}}}, |
232 | | {// BLOCK_16X32 |
233 | | {// intra |
234 | | {// tx_depth 0 |
235 | | {0, 0}}, |
236 | | {// tx_depth 1 |
237 | | {0, 0}, |
238 | | {0, 16}}, |
239 | | {// tx_depth 2 |
240 | | {0, 0}, |
241 | | {8, 0}, |
242 | | {0, 8}, |
243 | | {8, 8}, |
244 | | {0, 16}, |
245 | | {8, 16}, |
246 | | {0, 24}, |
247 | | {8, 24}}}, |
248 | | {// inter |
249 | | {// tx_depth 0 |
250 | | {0, 0}}, |
251 | | {// tx_depth 1 |
252 | | {0, 0}, |
253 | | {0, 16}}, |
254 | | {// tx_depth 2 |
255 | | {0, 0}, |
256 | | {8, 0}, |
257 | | {0, 8}, |
258 | | {8, 8}, |
259 | | {0, 16}, |
260 | | {8, 16}, |
261 | | {0, 24}, |
262 | | {8, 24}}}}, |
263 | | {// BLOCK_32X16 |
264 | | {// intra |
265 | | {// tx_depth 0 |
266 | | {0, 0}}, |
267 | | {// tx_depth 1 |
268 | | {0, 0}, |
269 | | {16, 0}}, |
270 | | {// tx_depth 2 |
271 | | {0, 0}, |
272 | | {8, 0}, |
273 | | {16, 0}, |
274 | | {24, 0}, |
275 | | {0, 8}, |
276 | | {8, 8}, |
277 | | {16, 8}, |
278 | | {24, 8}}}, |
279 | | {// inter |
280 | | {// tx_depth 0 |
281 | | {0, 0}}, |
282 | | {// tx_depth 1 |
283 | | {0, 0}, |
284 | | {16, 0}}, |
285 | | {// tx_depth 2 |
286 | | {0, 0}, |
287 | | {8, 0}, |
288 | | {0, 8}, |
289 | | {8, 8}, |
290 | | {16, 0}, |
291 | | {24, 0}, |
292 | | {16, 8}, |
293 | | {24, 8}}}}, |
294 | | {// BLOCK_32X32 |
295 | | {// intra |
296 | | {// tx_depth 0 |
297 | | {0, 0}}, |
298 | | {// tx_depth 1 |
299 | | {0, 0}, |
300 | | {16, 0}, |
301 | | {0, 16}, |
302 | | {16, 16}}, |
303 | | {// tx_depth 2 |
304 | | {0, 0}, |
305 | | {8, 0}, |
306 | | {16, 0}, |
307 | | {24, 0}, |
308 | | {0, 8}, |
309 | | {8, 8}, |
310 | | {16, 8}, |
311 | | {24, 8}, |
312 | | {0, 16}, |
313 | | {8, 16}, |
314 | | {16, 16}, |
315 | | {24, 16}, |
316 | | {0, 24}, |
317 | | {8, 24}, |
318 | | {16, 24}, |
319 | | {24, 24}}}, |
320 | | {// inter |
321 | | {// tx_depth 0 |
322 | | {0, 0}}, |
323 | | {// tx_depth 1 |
324 | | {0, 0}, |
325 | | {16, 0}, |
326 | | {0, 16}, |
327 | | {16, 16}}, |
328 | | {// tx_depth 2 |
329 | | {0, 0}, |
330 | | {8, 0}, |
331 | | {0, 8}, |
332 | | {8, 8}, |
333 | | {16, 0}, |
334 | | {24, 0}, |
335 | | {16, 8}, |
336 | | {24, 8}, |
337 | | {0, 16}, |
338 | | {8, 16}, |
339 | | {0, 24}, |
340 | | {8, 24}, |
341 | | {16, 16}, |
342 | | {24, 16}, |
343 | | {16, 24}, |
344 | | {24, 24}}}}, |
345 | | {// BLOCK_32X64 |
346 | | {// intra |
347 | | {// tx_depth 0 |
348 | | {0, 0}}, |
349 | | {// tx_depth 1 |
350 | | {0, 0}, |
351 | | {0, 32}}, |
352 | | {// tx_depth 2 |
353 | | {0, 0}, |
354 | | {16, 0}, |
355 | | {0, 16}, |
356 | | {16, 16}, |
357 | | {0, 32}, |
358 | | {16, 32}, |
359 | | {0, 48}, |
360 | | {16, 48}}}, |
361 | | {// inter |
362 | | {// tx_depth 0 |
363 | | {0, 0}}, |
364 | | {// tx_depth 1 |
365 | | {0, 0}, |
366 | | {0, 32}}, |
367 | | {// tx_depth 2 |
368 | | {0, 0}, |
369 | | {16, 0}, |
370 | | {0, 16}, |
371 | | {16, 16}, |
372 | | {0, 32}, |
373 | | {16, 32}, |
374 | | {0, 48}, |
375 | | {16, 48}}}}, |
376 | | {// BLOCK_64X32 |
377 | | {// intra |
378 | | {// tx_depth 0 |
379 | | {0, 0}}, |
380 | | {// tx_depth 1 |
381 | | {0, 0}, |
382 | | {32, 0}}, |
383 | | {// tx_depth 2 |
384 | | {0, 0}, |
385 | | {16, 0}, |
386 | | {32, 0}, |
387 | | {48, 0}, |
388 | | {0, 16}, |
389 | | {16, 16}, |
390 | | {32, 16}, |
391 | | {48, 16}}}, |
392 | | {// inter |
393 | | {// tx_depth 0 |
394 | | {0, 0}}, |
395 | | {// tx_depth 1 |
396 | | {0, 0}, |
397 | | {32, 0}}, |
398 | | {// tx_depth 2 |
399 | | {0, 0}, |
400 | | {16, 0}, |
401 | | {0, 16}, |
402 | | {16, 16}, |
403 | | {32, 0}, |
404 | | {48, 0}, |
405 | | {32, 16}, |
406 | | {48, 16}}}}, |
407 | | {// BLOCK_64X64 |
408 | | {// intra |
409 | | {// tx_depth 0 |
410 | | {0, 0}}, |
411 | | {// tx_depth 1 |
412 | | {0, 0}, |
413 | | {32, 0}, |
414 | | {0, 32}, |
415 | | {32, 32}}, |
416 | | {// tx_depth 2 |
417 | | {0, 0}, |
418 | | {16, 0}, |
419 | | {32, 0}, |
420 | | {48, 0}, |
421 | | {0, 16}, |
422 | | {16, 16}, |
423 | | {32, 16}, |
424 | | {48, 16}, |
425 | | {0, 32}, |
426 | | {16, 32}, |
427 | | {32, 32}, |
428 | | {48, 32}, |
429 | | {0, 48}, |
430 | | {16, 48}, |
431 | | {32, 48}, |
432 | | {48, 48}}}, |
433 | | {// inter |
434 | | {// tx_depth 0 |
435 | | {0, 0}}, |
436 | | {// tx_depth 1 |
437 | | {0, 0}, |
438 | | {32, 0}, |
439 | | {0, 32}, |
440 | | {32, 32}}, |
441 | | {// tx_depth 2 |
442 | | {0, 0}, |
443 | | {16, 0}, |
444 | | {0, 16}, |
445 | | {16, 16}, |
446 | | {32, 0}, |
447 | | {48, 0}, |
448 | | {32, 16}, |
449 | | {48, 16}, |
450 | | {0, 32}, |
451 | | {16, 32}, |
452 | | {0, 48}, |
453 | | {16, 48}, |
454 | | {32, 32}, |
455 | | {48, 32}, |
456 | | {32, 48}, |
457 | | {48, 48}}}}, |
458 | | {// BLOCK_64X128 |
459 | | {// intra |
460 | | {// tx_depth 0 |
461 | | {0, 0}, |
462 | | {0, 64}}, |
463 | | {// tx_depth 1 |
464 | | {0, 0}, |
465 | | {0, 64}}, |
466 | | {// tx_depth 2 |
467 | | {0, 0}, |
468 | | {0, 64}}}, |
469 | | {// inter |
470 | | {// tx_depth 0 |
471 | | {0, 0}, |
472 | | {0, 64}}, |
473 | | {// tx_depth 1 |
474 | | {0, 0}, |
475 | | {0, 64}}, |
476 | | {// tx_depth 2 |
477 | | {0, 0}, |
478 | | {0, 64}}}}, |
479 | | {// BLOCK_128X64 |
480 | | {// intra |
481 | | {// tx_depth 0 |
482 | | {0, 0}, |
483 | | {64, 0}}, |
484 | | {// tx_depth 1 |
485 | | {0, 0}, |
486 | | {64, 0}}, |
487 | | {// tx_depth 2 |
488 | | {0, 0}, |
489 | | {64, 0}}}, |
490 | | {// inter |
491 | | {// tx_depth 0 |
492 | | {0, 0}, |
493 | | {64, 0}}, |
494 | | {// tx_depth 1 |
495 | | {0, 0}, |
496 | | {64, 0}}, |
497 | | {// tx_depth 2 |
498 | | {0, 0}, |
499 | | {64, 0}}}}, |
500 | | {// BLOCK_128X128 |
501 | | {// intra |
502 | | {// tx_depth 0 |
503 | | {0, 0}, |
504 | | {64, 0}, |
505 | | {0, 64}, |
506 | | {64, 64}}, |
507 | | {// tx_depth 1 |
508 | | {0, 0}, |
509 | | {64, 0}, |
510 | | {0, 64}, |
511 | | {64, 64}}, |
512 | | {// tx_depth 2 |
513 | | {0, 0}, |
514 | | {64, 0}, |
515 | | {0, 64}, |
516 | | {64, 64}}}, |
517 | | {// inter |
518 | | {// tx_depth 0 |
519 | | {0, 0}, |
520 | | {64, 0}, |
521 | | {0, 64}, |
522 | | {64, 64}}, |
523 | | {// tx_depth 1 |
524 | | {0, 0}, |
525 | | {64, 0}, |
526 | | {0, 64}, |
527 | | {64, 64}}, |
528 | | {// tx_depth 2 |
529 | | {0, 0}, |
530 | | {64, 0}, |
531 | | {0, 64}, |
532 | | {64, 64}}}}, |
533 | | {// BLOCK_4X16 |
534 | | {// intra |
535 | | {// tx_depth 0 |
536 | | {0, 0}}, |
537 | | {// tx_depth 1 |
538 | | {0, 0}, |
539 | | {0, 8}}, |
540 | | {// tx_depth 2 |
541 | | {0, 0}, |
542 | | {0, 4}, |
543 | | {0, 8}, |
544 | | {0, 12}}}, |
545 | | {// inter |
546 | | {// tx_depth 0 |
547 | | {0, 0}}, |
548 | | {// tx_depth 1 |
549 | | {0, 0}, |
550 | | {0, 8}}, |
551 | | {// tx_depth 2 |
552 | | {0, 0}, |
553 | | {0, 4}, |
554 | | {0, 8}, |
555 | | {0, 12}}}}, |
556 | | {// BLOCK_16X4 |
557 | | {// intra |
558 | | {// tx_depth 0 |
559 | | {0, 0}}, |
560 | | {// tx_depth 1 |
561 | | {0, 0}, |
562 | | {8, 0}}, |
563 | | {// tx_depth 2 |
564 | | {0, 0}, |
565 | | {4, 0}, |
566 | | {8, 0}, |
567 | | {12, 0}}}, |
568 | | {// inter |
569 | | {// tx_depth 0 |
570 | | {0, 0}}, |
571 | | {// tx_depth 1 |
572 | | {0, 0}, |
573 | | {8, 0}}, |
574 | | {// tx_depth 2 |
575 | | {0, 0}, |
576 | | {4, 0}, |
577 | | {8, 0}, |
578 | | {12, 0}}}}, |
579 | | {// BLOCK_8X32 |
580 | | {// intra |
581 | | {// tx_depth 0 |
582 | | {0, 0}}, |
583 | | {// tx_depth 1 |
584 | | {0, 0}, |
585 | | {0, 16}}, |
586 | | {// tx_depth 2 |
587 | | {0, 0}, |
588 | | {0, 8}, |
589 | | {0, 16}, |
590 | | {0, 24}}}, |
591 | | {// inter |
592 | | {// tx_depth 0 |
593 | | {0, 0}}, |
594 | | {// tx_depth 1 |
595 | | {0, 0}, |
596 | | {0, 16}}, |
597 | | {// tx_depth 2 |
598 | | {0, 0}, |
599 | | {0, 8}, |
600 | | {0, 16}, |
601 | | {0, 24}}}}, |
602 | | {// BLOCK_32X8 |
603 | | {// intra |
604 | | {// tx_depth 0 |
605 | | {0, 0}}, |
606 | | {// tx_depth 1 |
607 | | {0, 0}, |
608 | | {16, 0}}, |
609 | | {// tx_depth 2 |
610 | | {0, 0}, |
611 | | {8, 0}, |
612 | | {16, 0}, |
613 | | {24, 0}}}, |
614 | | {// inter |
615 | | {// tx_depth 0 |
616 | | {0, 0}}, |
617 | | {// tx_depth 1 |
618 | | {0, 0}, |
619 | | {16, 0}}, |
620 | | {// tx_depth 2 |
621 | | {0, 0}, |
622 | | {8, 0}, |
623 | | {16, 0}, |
624 | | {24, 0}}}}, |
625 | | {// BLOCK_16X64 |
626 | | {// intra |
627 | | {// tx_depth 0 |
628 | | {0, 0}}, |
629 | | {// tx_depth 1 |
630 | | {0, 0}, |
631 | | {0, 32}}, |
632 | | {// tx_depth 2 |
633 | | {0, 0}, |
634 | | {0, 16}, |
635 | | {0, 32}, |
636 | | {0, 48}}}, |
637 | | {// inter |
638 | | {// tx_depth 0 |
639 | | {0, 0}}, |
640 | | {// tx_depth 1 |
641 | | {0, 0}, |
642 | | {0, 32}}, |
643 | | {// tx_depth 2 |
644 | | {0, 0}, |
645 | | {0, 16}, |
646 | | {0, 32}, |
647 | | {0, 48}}}}, |
648 | | {// BLOCK_64X16 |
649 | | {// intra |
650 | | {// tx_depth 0 |
651 | | {0, 0}}, |
652 | | {// tx_depth 1 |
653 | | {0, 0}, |
654 | | {32, 0}}, |
655 | | {// tx_depth 2 |
656 | | {0, 0}, |
657 | | {16, 0}, |
658 | | {32, 0}, |
659 | | {48, 0}}}, |
660 | | {// inter |
661 | | {// tx_depth 0 |
662 | | {0, 0}}, |
663 | | {// tx_depth 1 |
664 | | {0, 0}, |
665 | | {32, 0}}, |
666 | | {// tx_depth 2 |
667 | | {0, 0}, |
668 | | {16, 0}, |
669 | | {32, 0}, |
670 | | {48, 0}}}}}; |
671 | | |
672 | | static const int8_t fdct4_range_mult2[4] = {0, 2, 3, 3}; |
673 | | static const int8_t fdct8_range_mult2[6] = {0, 2, 4, 5, 5, 5}; |
674 | | static const int8_t fdct16_range_mult2[8] = {0, 2, 4, 6, 7, 7, 7, 7}; |
675 | | static const int8_t fdct32_range_mult2[10] = {0, 2, 4, 6, 8, 9, 9, 9, 9, 9}; |
676 | | static const int8_t fdct64_range_mult2[12] = {0, 2, 4, 6, 8, 10, 11, 11, 11, 11, 11, 11}; |
677 | | static const int8_t fadst4_range_mult2[7] = {0, 2, 4, 3, 3, 3, 3}; |
678 | | static const int8_t fadst8_range_mult2[8] = {0, 0, 1, 3, 3, 5, 5, 5}; |
679 | | static const int8_t fadst16_range_mult2[10] = {0, 0, 1, 3, 3, 5, 5, 7, 7, 7}; |
680 | | static const int8_t fadst32_range_mult2[12] = {0, 0, 1, 3, 3, 5, 5, 7, 7, 9, 9, 9}; |
681 | | static const int8_t fidtx4_range_mult2[1] = {1}; |
682 | | static const int8_t fidtx8_range_mult2[1] = {2}; |
683 | | static const int8_t fidtx16_range_mult2[1] = {3}; |
684 | | static const int8_t fidtx32_range_mult2[1] = {4}; |
685 | | static const int8_t fidtx64_range_mult2[1] = {5}; |
686 | | |
687 | | static const int8_t* fwd_txfm_range_mult2_list[TXFM_TYPES] = {fdct4_range_mult2, |
688 | | fdct8_range_mult2, |
689 | | fdct16_range_mult2, |
690 | | fdct32_range_mult2, |
691 | | fdct64_range_mult2, |
692 | | fadst4_range_mult2, |
693 | | fadst8_range_mult2, |
694 | | fadst16_range_mult2, |
695 | | fadst32_range_mult2, |
696 | | fidtx4_range_mult2, |
697 | | fidtx8_range_mult2, |
698 | | fidtx16_range_mult2, |
699 | | fidtx32_range_mult2, |
700 | | fidtx64_range_mult2}; |
701 | | |
702 | | static const int8_t fwd_shift_4x4[3] = {2, 0, 0}; |
703 | | static const int8_t fwd_shift_8x8[3] = {2, -1, 0}; |
704 | | static const int8_t fwd_shift_16x16[3] = {2, -2, 0}; |
705 | | static const int8_t fwd_shift_32x32[3] = {2, -4, 0}; |
706 | | static const int8_t fwd_shift_64x64[3] = {0, -2, -2}; |
707 | | static const int8_t fwd_shift_4x8[3] = {2, -1, 0}; |
708 | | static const int8_t fwd_shift_8x4[3] = {2, -1, 0}; |
709 | | static const int8_t fwd_shift_8x16[3] = {2, -2, 0}; |
710 | | static const int8_t fwd_shift_16x8[3] = {2, -2, 0}; |
711 | | static const int8_t fwd_shift_16x32[3] = {2, -4, 0}; |
712 | | static const int8_t fwd_shift_32x16[3] = {2, -4, 0}; |
713 | | static const int8_t fwd_shift_32x64[3] = {0, -2, -2}; |
714 | | static const int8_t fwd_shift_64x32[3] = {2, -4, -2}; |
715 | | static const int8_t fwd_shift_4x16[3] = {2, -1, 0}; |
716 | | static const int8_t fwd_shift_16x4[3] = {2, -1, 0}; |
717 | | static const int8_t fwd_shift_8x32[3] = {2, -2, 0}; |
718 | | static const int8_t fwd_shift_32x8[3] = {2, -2, 0}; |
719 | | static const int8_t fwd_shift_16x64[3] = {0, -2, 0}; |
720 | | static const int8_t fwd_shift_64x16[3] = {2, -4, 0}; |
721 | | |
722 | | const int8_t* fwd_txfm_shift_ls[TX_SIZES_ALL] = { |
723 | | fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32, fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, |
724 | | fwd_shift_8x16, fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64, fwd_shift_64x32, fwd_shift_4x16, |
725 | | fwd_shift_16x4, fwd_shift_8x32, fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16, |
726 | | }; |
727 | | |
728 | | void svt_av1_gen_fwd_stage_range(int8_t* stage_range_col, int8_t* stage_range_row, const Txfm2dFlipCfg* cfg, |
729 | 30.1k | int32_t bd) { |
730 | | // Take the shift from the larger dimension in the rectangular case. |
731 | 30.1k | const int8_t* shift = cfg->shift; |
732 | | // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning |
733 | 262k | for (int32_t i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { |
734 | 232k | stage_range_col[i] = (int8_t)(cfg->stage_range_col[i] + shift[0] + bd + 1); |
735 | 232k | } |
736 | | // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning |
737 | 267k | for (int32_t i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { |
738 | 237k | stage_range_row[i] = (int8_t)(cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1); |
739 | 237k | } |
740 | 30.1k | } |
741 | | |
/*
 * 4-point forward DCT written out as a fixed three-stage butterfly network.
 *
 * input       - four source values; only read during stage 1.
 * output      - receives the four results; also holds the stage 1 intermediate.
 * cos_bit     - selects the cosine table through cospi_arr() and is forwarded to
 *               every half_btf() call.
 * stage_range - not used by this function.
 */
void svt_av1_fdct4_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;

    int32_t rotated[4];

    // Stage 1: sums of the mirrored pairs first, then their differences.
    output[0] = input[0] + input[3];
    output[1] = input[1] + input[2];
    output[2] = input[1] - input[2];
    output[3] = input[0] - input[3];

    // Stage 2: weighted two-input combinations, output -> rotated.
    const int32_t* const cospi = cospi_arr(cos_bit);
    rotated[0] = half_btf(cospi[32], output[0], cospi[32], output[1], cos_bit);
    rotated[1] = half_btf(-cospi[32], output[1], cospi[32], output[0], cos_bit);
    rotated[2] = half_btf(cospi[48], output[2], cospi[16], output[3], cos_bit);
    rotated[3] = half_btf(cospi[48], output[3], -cospi[16], output[2], cos_bit);

    // Stage 3: write back with the two middle entries swapped.
    output[0] = rotated[0];
    output[1] = rotated[2];
    output[2] = rotated[1];
    output[3] = rotated[3];
}
775 | | |
/*
 * 8-point forward DCT written out as a fixed five-stage butterfly network.
 * Stages alternate between the caller's output buffer and a local scratch array.
 *
 * input       - eight source values; only read during stage 1.
 * output      - receives the eight results; also used as an intermediate buffer.
 * cos_bit     - selects the cosine table through cospi_arr() and is forwarded to
 *               every half_btf() call.
 * stage_range - not used by this function.
 */
void svt_av1_fdct8_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;

    // Source index for each output slot in the final reordering stage.
    static const int8_t final_order[8] = {0, 4, 2, 6, 1, 5, 3, 7};

    const int32_t* cospi;
    int32_t        scratch[8];

    // Stage 1: input -> output. Mirrored sums, then mirrored differences,
    // written in ascending index order.
    for (int32_t k = 0; k < 4; ++k) {
        output[k] = input[k] + input[7 - k];
    }
    for (int32_t k = 4; k < 8; ++k) {
        output[k] = input[7 - k] - input[k];
    }

    // Stage 2: output -> scratch.
    cospi      = cospi_arr(cos_bit);
    scratch[0] = output[0] + output[3];
    scratch[1] = output[1] + output[2];
    scratch[2] = output[1] - output[2];
    scratch[3] = output[0] - output[3];
    scratch[4] = output[4];
    scratch[5] = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    scratch[6] = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    scratch[7] = output[7];

    // Stage 3: scratch -> output.
    cospi     = cospi_arr(cos_bit);
    output[0] = half_btf(cospi[32], scratch[0], cospi[32], scratch[1], cos_bit);
    output[1] = half_btf(-cospi[32], scratch[1], cospi[32], scratch[0], cos_bit);
    output[2] = half_btf(cospi[48], scratch[2], cospi[16], scratch[3], cos_bit);
    output[3] = half_btf(cospi[48], scratch[3], -cospi[16], scratch[2], cos_bit);
    output[4] = scratch[4] + scratch[5];
    output[5] = scratch[4] - scratch[5];
    output[6] = scratch[7] - scratch[6];
    output[7] = scratch[7] + scratch[6];

    // Stage 4: output -> scratch. The lower half passes through unchanged.
    cospi = cospi_arr(cos_bit);
    for (int32_t k = 0; k < 4; ++k) {
        scratch[k] = output[k];
    }
    scratch[4] = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);
    scratch[5] = half_btf(cospi[24], output[5], cospi[40], output[6], cos_bit);
    scratch[6] = half_btf(cospi[24], output[6], -cospi[40], output[5], cos_bit);
    scratch[7] = half_btf(cospi[56], output[7], -cospi[8], output[4], cos_bit);

    // Stage 5: scratch -> output, reordering the entries.
    for (int32_t k = 0; k < 8; ++k) {
        output[k] = scratch[final_order[k]];
    }
}
847 | | |
/*
 * 16-point forward DCT written out as a fixed seven-stage butterfly network.
 * Stages alternate between the caller's output buffer and a local scratch array.
 *
 * input       - sixteen source values; only read during stage 1.
 * output      - receives the sixteen results; also used as an intermediate buffer.
 * cos_bit     - selects the cosine table through cospi_arr() and is forwarded to
 *               every half_btf() call.
 * stage_range - not used by this function.
 */
void svt_av1_fdct16_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;

    // Source index for each output slot in the final reordering stage.
    static const int8_t final_order[16] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};

    const int32_t* cospi;
    int32_t        scratch[16];

    // Stage 1: input -> output. Mirrored sums, then mirrored differences,
    // written in ascending index order.
    for (int32_t k = 0; k < 8; ++k) {
        output[k] = input[k] + input[15 - k];
    }
    for (int32_t k = 8; k < 16; ++k) {
        output[k] = input[15 - k] - input[k];
    }

    // Stage 2: output -> scratch.
    cospi = cospi_arr(cos_bit);
    for (int32_t k = 0; k < 4; ++k) {
        scratch[k] = output[k] + output[7 - k];
    }
    for (int32_t k = 4; k < 8; ++k) {
        scratch[k] = output[7 - k] - output[k];
    }
    scratch[8]  = output[8];
    scratch[9]  = output[9];
    scratch[10] = half_btf(-cospi[32], output[10], cospi[32], output[13], cos_bit);
    scratch[11] = half_btf(-cospi[32], output[11], cospi[32], output[12], cos_bit);
    scratch[12] = half_btf(cospi[32], output[12], cospi[32], output[11], cos_bit);
    scratch[13] = half_btf(cospi[32], output[13], cospi[32], output[10], cos_bit);
    scratch[14] = output[14];
    scratch[15] = output[15];

    // Stage 3: scratch -> output.
    cospi      = cospi_arr(cos_bit);
    output[0]  = scratch[0] + scratch[3];
    output[1]  = scratch[1] + scratch[2];
    output[2]  = scratch[1] - scratch[2];
    output[3]  = scratch[0] - scratch[3];
    output[4]  = scratch[4];
    output[5]  = half_btf(-cospi[32], scratch[5], cospi[32], scratch[6], cos_bit);
    output[6]  = half_btf(cospi[32], scratch[6], cospi[32], scratch[5], cos_bit);
    output[7]  = scratch[7];
    output[8]  = scratch[8] + scratch[11];
    output[9]  = scratch[9] + scratch[10];
    output[10] = scratch[9] - scratch[10];
    output[11] = scratch[8] - scratch[11];
    output[12] = scratch[15] - scratch[12];
    output[13] = scratch[14] - scratch[13];
    output[14] = scratch[14] + scratch[13];
    output[15] = scratch[15] + scratch[12];

    // Stage 4: output -> scratch.
    cospi       = cospi_arr(cos_bit);
    scratch[0]  = half_btf(cospi[32], output[0], cospi[32], output[1], cos_bit);
    scratch[1]  = half_btf(-cospi[32], output[1], cospi[32], output[0], cos_bit);
    scratch[2]  = half_btf(cospi[48], output[2], cospi[16], output[3], cos_bit);
    scratch[3]  = half_btf(cospi[48], output[3], -cospi[16], output[2], cos_bit);
    scratch[4]  = output[4] + output[5];
    scratch[5]  = output[4] - output[5];
    scratch[6]  = output[7] - output[6];
    scratch[7]  = output[7] + output[6];
    scratch[8]  = output[8];
    scratch[9]  = half_btf(-cospi[16], output[9], cospi[48], output[14], cos_bit);
    scratch[10] = half_btf(-cospi[48], output[10], -cospi[16], output[13], cos_bit);
    scratch[11] = output[11];
    scratch[12] = output[12];
    scratch[13] = half_btf(cospi[48], output[13], -cospi[16], output[10], cos_bit);
    scratch[14] = half_btf(cospi[16], output[14], cospi[48], output[9], cos_bit);
    scratch[15] = output[15];

    // Stage 5: scratch -> output. The first four entries pass through unchanged.
    cospi = cospi_arr(cos_bit);
    for (int32_t k = 0; k < 4; ++k) {
        output[k] = scratch[k];
    }
    output[4]  = half_btf(cospi[56], scratch[4], cospi[8], scratch[7], cos_bit);
    output[5]  = half_btf(cospi[24], scratch[5], cospi[40], scratch[6], cos_bit);
    output[6]  = half_btf(cospi[24], scratch[6], -cospi[40], scratch[5], cos_bit);
    output[7]  = half_btf(cospi[56], scratch[7], -cospi[8], scratch[4], cos_bit);
    output[8]  = scratch[8] + scratch[9];
    output[9]  = scratch[8] - scratch[9];
    output[10] = scratch[11] - scratch[10];
    output[11] = scratch[11] + scratch[10];
    output[12] = scratch[12] + scratch[13];
    output[13] = scratch[12] - scratch[13];
    output[14] = scratch[15] - scratch[14];
    output[15] = scratch[15] + scratch[14];

    // Stage 6: output -> scratch. The first eight entries pass through unchanged.
    cospi = cospi_arr(cos_bit);
    for (int32_t k = 0; k < 8; ++k) {
        scratch[k] = output[k];
    }
    scratch[8]  = half_btf(cospi[60], output[8], cospi[4], output[15], cos_bit);
    scratch[9]  = half_btf(cospi[28], output[9], cospi[36], output[14], cos_bit);
    scratch[10] = half_btf(cospi[44], output[10], cospi[20], output[13], cos_bit);
    scratch[11] = half_btf(cospi[12], output[11], cospi[52], output[12], cos_bit);
    scratch[12] = half_btf(cospi[12], output[12], -cospi[52], output[11], cos_bit);
    scratch[13] = half_btf(cospi[44], output[13], -cospi[20], output[10], cos_bit);
    scratch[14] = half_btf(cospi[28], output[14], -cospi[36], output[9], cos_bit);
    scratch[15] = half_btf(cospi[60], output[15], -cospi[4], output[8], cos_bit);

    // Stage 7: scratch -> output, reordering the entries.
    for (int32_t k = 0; k < 16; ++k) {
        output[k] = scratch[final_order[k]];
    }
}
1001 | | |
/*
 * 32-point forward DCT (per the function name), evaluated as a nine-stage
 * butterfly network that alternates between `output` and a local scratch
 * array: odd-numbered stages write `output`, even-numbered stages write the
 * scratch array.
 *
 *  input       - source values; indices 0..31 are read (stage 1 only).
 *  output      - receives the 32 results; also serves as a working buffer.
 *  cos_bit     - selects the cosine table through cospi_arr() and is passed
 *                unchanged to every half_btf() call.
 *  stage_range - accepted but not used.
 *
 * half_btf() is defined elsewhere; it presumably returns a rounded
 * (w0 * in0 + w1 * in1) >> cos_bit - confirm against its definition.
 *
 * NOTE(review): stage 1 fills output[] in ascending index order while it is
 * still reading input[], so an in-place call would read already-overwritten
 * values. Callers presumably pass distinct buffers - confirm.
 */
void svt_av1_fdct32_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    // First cosine index of each weight pair in stages 7 and 8; the second index is 64 minus it.
    static const uint8_t stage7_idx[4] = {60, 28, 44, 12};
    static const uint8_t stage8_idx[8] = {62, 30, 46, 14, 54, 22, 38, 6};
    // Stage 9: scratch index feeding each output slot (5-bit bit reversal of the slot number).
    static const uint8_t reorder[32] = {0, 16, 8,  24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
                                        1, 17, 9,  25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31};

    const int32_t* cospi;
    int32_t        scratch[32];
    int            i, b, k;

    (void)stage_range;

    // stage 1 (input -> output): mirrored sums in the lower half, mirrored differences in the upper half
    for (i = 0; i < 16; i++) {
        output[i] = input[i] + input[31 - i];
    }
    for (i = 16; i < 32; i++) {
        output[i] = -input[i] + input[31 - i];
    }

    // The table depends only on cos_bit, so a single lookup serves all remaining stages.
    cospi = cospi_arr(cos_bit);

    // stage 2 (output -> scratch)
    for (i = 0; i < 8; i++) {
        scratch[i] = output[i] + output[15 - i];
    }
    for (i = 8; i < 16; i++) {
        scratch[i] = -output[i] + output[15 - i];
    }
    for (i = 16; i < 20; i++) {
        scratch[i] = output[i];
    }
    for (i = 20; i < 24; i++) {
        scratch[i] = half_btf(-cospi[32], output[i], cospi[32], output[47 - i], cos_bit);
    }
    for (i = 24; i < 28; i++) {
        scratch[i] = half_btf(cospi[32], output[i], cospi[32], output[47 - i], cos_bit);
    }
    for (i = 28; i < 32; i++) {
        scratch[i] = output[i];
    }

    // stage 3 (scratch -> output)
    for (i = 0; i < 4; i++) {
        output[i] = scratch[i] + scratch[7 - i];
    }
    for (i = 4; i < 8; i++) {
        output[i] = -scratch[i] + scratch[7 - i];
    }
    output[8]  = scratch[8];
    output[9]  = scratch[9];
    output[10] = half_btf(-cospi[32], scratch[10], cospi[32], scratch[13], cos_bit);
    output[11] = half_btf(-cospi[32], scratch[11], cospi[32], scratch[12], cos_bit);
    output[12] = half_btf(cospi[32], scratch[12], cospi[32], scratch[11], cos_bit);
    output[13] = half_btf(cospi[32], scratch[13], cospi[32], scratch[10], cos_bit);
    output[14] = scratch[14];
    output[15] = scratch[15];
    for (i = 16; i < 20; i++) {
        output[i] = scratch[i] + scratch[39 - i];
    }
    for (i = 20; i < 24; i++) {
        output[i] = -scratch[i] + scratch[39 - i];
    }
    for (i = 24; i < 28; i++) {
        output[i] = -scratch[i] + scratch[55 - i];
    }
    for (i = 28; i < 32; i++) {
        output[i] = scratch[i] + scratch[55 - i];
    }

    // stage 4 (output -> scratch)
    scratch[0]  = output[0] + output[3];
    scratch[1]  = output[1] + output[2];
    scratch[2]  = -output[2] + output[1];
    scratch[3]  = -output[3] + output[0];
    scratch[4]  = output[4];
    scratch[5]  = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    scratch[6]  = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    scratch[7]  = output[7];
    scratch[8]  = output[8] + output[11];
    scratch[9]  = output[9] + output[10];
    scratch[10] = -output[10] + output[9];
    scratch[11] = -output[11] + output[8];
    scratch[12] = -output[12] + output[15];
    scratch[13] = -output[13] + output[14];
    scratch[14] = output[14] + output[13];
    scratch[15] = output[15] + output[12];
    scratch[16] = output[16];
    scratch[17] = output[17];
    scratch[18] = half_btf(-cospi[16], output[18], cospi[48], output[29], cos_bit);
    scratch[19] = half_btf(-cospi[16], output[19], cospi[48], output[28], cos_bit);
    scratch[20] = half_btf(-cospi[48], output[20], -cospi[16], output[27], cos_bit);
    scratch[21] = half_btf(-cospi[48], output[21], -cospi[16], output[26], cos_bit);
    for (i = 22; i < 26; i++) {
        scratch[i] = output[i];
    }
    scratch[26] = half_btf(cospi[48], output[26], -cospi[16], output[21], cos_bit);
    scratch[27] = half_btf(cospi[48], output[27], -cospi[16], output[20], cos_bit);
    scratch[28] = half_btf(cospi[16], output[28], cospi[48], output[19], cos_bit);
    scratch[29] = half_btf(cospi[16], output[29], cospi[48], output[18], cos_bit);
    scratch[30] = output[30];
    scratch[31] = output[31];

    // stage 5 (scratch -> output)
    output[0]  = half_btf(cospi[32], scratch[0], cospi[32], scratch[1], cos_bit);
    output[1]  = half_btf(-cospi[32], scratch[1], cospi[32], scratch[0], cos_bit);
    output[2]  = half_btf(cospi[48], scratch[2], cospi[16], scratch[3], cos_bit);
    output[3]  = half_btf(cospi[48], scratch[3], -cospi[16], scratch[2], cos_bit);
    output[4]  = scratch[4] + scratch[5];
    output[5]  = -scratch[5] + scratch[4];
    output[6]  = -scratch[6] + scratch[7];
    output[7]  = scratch[7] + scratch[6];
    output[8]  = scratch[8];
    output[9]  = half_btf(-cospi[16], scratch[9], cospi[48], scratch[14], cos_bit);
    output[10] = half_btf(-cospi[48], scratch[10], -cospi[16], scratch[13], cos_bit);
    output[11] = scratch[11];
    output[12] = scratch[12];
    output[13] = half_btf(cospi[48], scratch[13], -cospi[16], scratch[10], cos_bit);
    output[14] = half_btf(cospi[16], scratch[14], cospi[48], scratch[9], cos_bit);
    output[15] = scratch[15];
    // Entries 16..31 repeat one eight-wide add/subtract pattern at bases 16 and 24.
    for (b = 16; b < 32; b += 8) {
        output[b]     = scratch[b] + scratch[b + 3];
        output[b + 1] = scratch[b + 1] + scratch[b + 2];
        output[b + 2] = -scratch[b + 2] + scratch[b + 1];
        output[b + 3] = -scratch[b + 3] + scratch[b];
        output[b + 4] = -scratch[b + 4] + scratch[b + 7];
        output[b + 5] = -scratch[b + 5] + scratch[b + 6];
        output[b + 6] = scratch[b + 6] + scratch[b + 5];
        output[b + 7] = scratch[b + 7] + scratch[b + 4];
    }

    // stage 6 (output -> scratch)
    for (i = 0; i < 4; i++) {
        scratch[i] = output[i];
    }
    scratch[4] = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);
    scratch[5] = half_btf(cospi[24], output[5], cospi[40], output[6], cos_bit);
    scratch[6] = half_btf(cospi[24], output[6], -cospi[40], output[5], cos_bit);
    scratch[7] = half_btf(cospi[56], output[7], -cospi[8], output[4], cos_bit);
    // Entries 8..15 repeat one four-wide add/subtract pattern at bases 8 and 12.
    for (b = 8; b < 16; b += 4) {
        scratch[b]     = output[b] + output[b + 1];
        scratch[b + 1] = -output[b + 1] + output[b];
        scratch[b + 2] = -output[b + 2] + output[b + 3];
        scratch[b + 3] = output[b + 3] + output[b + 2];
    }
    scratch[16] = output[16];
    scratch[17] = half_btf(-cospi[8], output[17], cospi[56], output[30], cos_bit);
    scratch[18] = half_btf(-cospi[56], output[18], -cospi[8], output[29], cos_bit);
    scratch[19] = output[19];
    scratch[20] = output[20];
    scratch[21] = half_btf(-cospi[40], output[21], cospi[24], output[26], cos_bit);
    scratch[22] = half_btf(-cospi[24], output[22], -cospi[40], output[25], cos_bit);
    scratch[23] = output[23];
    scratch[24] = output[24];
    scratch[25] = half_btf(cospi[24], output[25], -cospi[40], output[22], cos_bit);
    scratch[26] = half_btf(cospi[40], output[26], cospi[24], output[21], cos_bit);
    scratch[27] = output[27];
    scratch[28] = output[28];
    scratch[29] = half_btf(cospi[56], output[29], -cospi[8], output[18], cos_bit);
    scratch[30] = half_btf(cospi[8], output[30], cospi[56], output[17], cos_bit);
    scratch[31] = output[31];

    // stage 7 (scratch -> output)
    for (i = 0; i < 8; i++) {
        output[i] = scratch[i];
    }
    // Entries 8..15: slot 8+k is paired with slot 15-k and both share one weight pair.
    for (k = 0; k < 4; k++) {
        const int32_t w_a = cospi[stage7_idx[k]];
        const int32_t w_b = cospi[64 - stage7_idx[k]];

        output[8 + k]  = half_btf(w_a, scratch[8 + k], w_b, scratch[15 - k], cos_bit);
        output[15 - k] = half_btf(w_a, scratch[15 - k], -w_b, scratch[8 + k], cos_bit);
    }
    // Entries 16..31 repeat one four-wide add/subtract pattern at bases 16, 20, 24 and 28.
    for (b = 16; b < 32; b += 4) {
        output[b]     = scratch[b] + scratch[b + 1];
        output[b + 1] = -scratch[b + 1] + scratch[b];
        output[b + 2] = -scratch[b + 2] + scratch[b + 3];
        output[b + 3] = scratch[b + 3] + scratch[b + 2];
    }

    // stage 8 (output -> scratch)
    for (i = 0; i < 16; i++) {
        scratch[i] = output[i];
    }
    // Entries 16..31: slot 16+k is paired with slot 31-k and both share one weight pair.
    for (k = 0; k < 8; k++) {
        const int32_t w_a = cospi[stage8_idx[k]];
        const int32_t w_b = cospi[64 - stage8_idx[k]];

        scratch[16 + k] = half_btf(w_a, output[16 + k], w_b, output[31 - k], cos_bit);
        scratch[31 - k] = half_btf(w_a, output[31 - k], -w_b, output[16 + k], cos_bit);
    }

    // stage 9 (scratch -> output): reorder only, no arithmetic
    for (i = 0; i < 32; i++) {
        output[i] = scratch[reorder[i]];
    }
}
1341 | | |
1342 | 485k | void svt_av1_fdct64_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
1343 | 485k | (void)stage_range; |
1344 | 485k | const int32_t* cospi; |
1345 | | |
1346 | 485k | int32_t *bf0, *bf1; |
1347 | 485k | int32_t step[64]; |
1348 | | |
1349 | | // stage 0; |
1350 | | |
1351 | | // stage 1; |
1352 | 485k | bf1 = output; |
1353 | 485k | bf1[0] = input[0] + input[63]; |
1354 | 485k | bf1[1] = input[1] + input[62]; |
1355 | 485k | bf1[2] = input[2] + input[61]; |
1356 | 485k | bf1[3] = input[3] + input[60]; |
1357 | 485k | bf1[4] = input[4] + input[59]; |
1358 | 485k | bf1[5] = input[5] + input[58]; |
1359 | 485k | bf1[6] = input[6] + input[57]; |
1360 | 485k | bf1[7] = input[7] + input[56]; |
1361 | 485k | bf1[8] = input[8] + input[55]; |
1362 | 485k | bf1[9] = input[9] + input[54]; |
1363 | 485k | bf1[10] = input[10] + input[53]; |
1364 | 485k | bf1[11] = input[11] + input[52]; |
1365 | 485k | bf1[12] = input[12] + input[51]; |
1366 | 485k | bf1[13] = input[13] + input[50]; |
1367 | 485k | bf1[14] = input[14] + input[49]; |
1368 | 485k | bf1[15] = input[15] + input[48]; |
1369 | 485k | bf1[16] = input[16] + input[47]; |
1370 | 485k | bf1[17] = input[17] + input[46]; |
1371 | 485k | bf1[18] = input[18] + input[45]; |
1372 | 485k | bf1[19] = input[19] + input[44]; |
1373 | 485k | bf1[20] = input[20] + input[43]; |
1374 | 485k | bf1[21] = input[21] + input[42]; |
1375 | 485k | bf1[22] = input[22] + input[41]; |
1376 | 485k | bf1[23] = input[23] + input[40]; |
1377 | 485k | bf1[24] = input[24] + input[39]; |
1378 | 485k | bf1[25] = input[25] + input[38]; |
1379 | 485k | bf1[26] = input[26] + input[37]; |
1380 | 485k | bf1[27] = input[27] + input[36]; |
1381 | 485k | bf1[28] = input[28] + input[35]; |
1382 | 485k | bf1[29] = input[29] + input[34]; |
1383 | 485k | bf1[30] = input[30] + input[33]; |
1384 | 485k | bf1[31] = input[31] + input[32]; |
1385 | 485k | bf1[32] = -input[32] + input[31]; |
1386 | 485k | bf1[33] = -input[33] + input[30]; |
1387 | 485k | bf1[34] = -input[34] + input[29]; |
1388 | 485k | bf1[35] = -input[35] + input[28]; |
1389 | 485k | bf1[36] = -input[36] + input[27]; |
1390 | 485k | bf1[37] = -input[37] + input[26]; |
1391 | 485k | bf1[38] = -input[38] + input[25]; |
1392 | 485k | bf1[39] = -input[39] + input[24]; |
1393 | 485k | bf1[40] = -input[40] + input[23]; |
1394 | 485k | bf1[41] = -input[41] + input[22]; |
1395 | 485k | bf1[42] = -input[42] + input[21]; |
1396 | 485k | bf1[43] = -input[43] + input[20]; |
1397 | 485k | bf1[44] = -input[44] + input[19]; |
1398 | 485k | bf1[45] = -input[45] + input[18]; |
1399 | 485k | bf1[46] = -input[46] + input[17]; |
1400 | 485k | bf1[47] = -input[47] + input[16]; |
1401 | 485k | bf1[48] = -input[48] + input[15]; |
1402 | 485k | bf1[49] = -input[49] + input[14]; |
1403 | 485k | bf1[50] = -input[50] + input[13]; |
1404 | 485k | bf1[51] = -input[51] + input[12]; |
1405 | 485k | bf1[52] = -input[52] + input[11]; |
1406 | 485k | bf1[53] = -input[53] + input[10]; |
1407 | 485k | bf1[54] = -input[54] + input[9]; |
1408 | 485k | bf1[55] = -input[55] + input[8]; |
1409 | 485k | bf1[56] = -input[56] + input[7]; |
1410 | 485k | bf1[57] = -input[57] + input[6]; |
1411 | 485k | bf1[58] = -input[58] + input[5]; |
1412 | 485k | bf1[59] = -input[59] + input[4]; |
1413 | 485k | bf1[60] = -input[60] + input[3]; |
1414 | 485k | bf1[61] = -input[61] + input[2]; |
1415 | 485k | bf1[62] = -input[62] + input[1]; |
1416 | 485k | bf1[63] = -input[63] + input[0]; |
1417 | | |
1418 | | // stage 2 |
1419 | 485k | cospi = cospi_arr(cos_bit); |
1420 | 485k | bf0 = output; |
1421 | 485k | bf1 = step; |
1422 | 485k | bf1[0] = bf0[0] + bf0[31]; |
1423 | 485k | bf1[1] = bf0[1] + bf0[30]; |
1424 | 485k | bf1[2] = bf0[2] + bf0[29]; |
1425 | 485k | bf1[3] = bf0[3] + bf0[28]; |
1426 | 485k | bf1[4] = bf0[4] + bf0[27]; |
1427 | 485k | bf1[5] = bf0[5] + bf0[26]; |
1428 | 485k | bf1[6] = bf0[6] + bf0[25]; |
1429 | 485k | bf1[7] = bf0[7] + bf0[24]; |
1430 | 485k | bf1[8] = bf0[8] + bf0[23]; |
1431 | 485k | bf1[9] = bf0[9] + bf0[22]; |
1432 | 485k | bf1[10] = bf0[10] + bf0[21]; |
1433 | 485k | bf1[11] = bf0[11] + bf0[20]; |
1434 | 485k | bf1[12] = bf0[12] + bf0[19]; |
1435 | 485k | bf1[13] = bf0[13] + bf0[18]; |
1436 | 485k | bf1[14] = bf0[14] + bf0[17]; |
1437 | 485k | bf1[15] = bf0[15] + bf0[16]; |
1438 | 485k | bf1[16] = -bf0[16] + bf0[15]; |
1439 | 485k | bf1[17] = -bf0[17] + bf0[14]; |
1440 | 485k | bf1[18] = -bf0[18] + bf0[13]; |
1441 | 485k | bf1[19] = -bf0[19] + bf0[12]; |
1442 | 485k | bf1[20] = -bf0[20] + bf0[11]; |
1443 | 485k | bf1[21] = -bf0[21] + bf0[10]; |
1444 | 485k | bf1[22] = -bf0[22] + bf0[9]; |
1445 | 485k | bf1[23] = -bf0[23] + bf0[8]; |
1446 | 485k | bf1[24] = -bf0[24] + bf0[7]; |
1447 | 485k | bf1[25] = -bf0[25] + bf0[6]; |
1448 | 485k | bf1[26] = -bf0[26] + bf0[5]; |
1449 | 485k | bf1[27] = -bf0[27] + bf0[4]; |
1450 | 485k | bf1[28] = -bf0[28] + bf0[3]; |
1451 | 485k | bf1[29] = -bf0[29] + bf0[2]; |
1452 | 485k | bf1[30] = -bf0[30] + bf0[1]; |
1453 | 485k | bf1[31] = -bf0[31] + bf0[0]; |
1454 | 485k | bf1[32] = bf0[32]; |
1455 | 485k | bf1[33] = bf0[33]; |
1456 | 485k | bf1[34] = bf0[34]; |
1457 | 485k | bf1[35] = bf0[35]; |
1458 | 485k | bf1[36] = bf0[36]; |
1459 | 485k | bf1[37] = bf0[37]; |
1460 | 485k | bf1[38] = bf0[38]; |
1461 | 485k | bf1[39] = bf0[39]; |
1462 | 485k | bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); |
1463 | 485k | bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); |
1464 | 485k | bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); |
1465 | 485k | bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); |
1466 | 485k | bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); |
1467 | 485k | bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); |
1468 | 485k | bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); |
1469 | 485k | bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); |
1470 | 485k | bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit); |
1471 | 485k | bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit); |
1472 | 485k | bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit); |
1473 | 485k | bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit); |
1474 | 485k | bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit); |
1475 | 485k | bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit); |
1476 | 485k | bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit); |
1477 | 485k | bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit); |
1478 | 485k | bf1[56] = bf0[56]; |
1479 | 485k | bf1[57] = bf0[57]; |
1480 | 485k | bf1[58] = bf0[58]; |
1481 | 485k | bf1[59] = bf0[59]; |
1482 | 485k | bf1[60] = bf0[60]; |
1483 | 485k | bf1[61] = bf0[61]; |
1484 | 485k | bf1[62] = bf0[62]; |
1485 | 485k | bf1[63] = bf0[63]; |
1486 | | |
1487 | | // stage 3 |
1488 | 485k | cospi = cospi_arr(cos_bit); |
1489 | 485k | bf0 = step; |
1490 | 485k | bf1 = output; |
1491 | 485k | bf1[0] = bf0[0] + bf0[15]; |
1492 | 485k | bf1[1] = bf0[1] + bf0[14]; |
1493 | 485k | bf1[2] = bf0[2] + bf0[13]; |
1494 | 485k | bf1[3] = bf0[3] + bf0[12]; |
1495 | 485k | bf1[4] = bf0[4] + bf0[11]; |
1496 | 485k | bf1[5] = bf0[5] + bf0[10]; |
1497 | 485k | bf1[6] = bf0[6] + bf0[9]; |
1498 | 485k | bf1[7] = bf0[7] + bf0[8]; |
1499 | 485k | bf1[8] = -bf0[8] + bf0[7]; |
1500 | 485k | bf1[9] = -bf0[9] + bf0[6]; |
1501 | 485k | bf1[10] = -bf0[10] + bf0[5]; |
1502 | 485k | bf1[11] = -bf0[11] + bf0[4]; |
1503 | 485k | bf1[12] = -bf0[12] + bf0[3]; |
1504 | 485k | bf1[13] = -bf0[13] + bf0[2]; |
1505 | 485k | bf1[14] = -bf0[14] + bf0[1]; |
1506 | 485k | bf1[15] = -bf0[15] + bf0[0]; |
1507 | 485k | bf1[16] = bf0[16]; |
1508 | 485k | bf1[17] = bf0[17]; |
1509 | 485k | bf1[18] = bf0[18]; |
1510 | 485k | bf1[19] = bf0[19]; |
1511 | 485k | bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); |
1512 | 485k | bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); |
1513 | 485k | bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); |
1514 | 485k | bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); |
1515 | 485k | bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); |
1516 | 485k | bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); |
1517 | 485k | bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); |
1518 | 485k | bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); |
1519 | 485k | bf1[28] = bf0[28]; |
1520 | 485k | bf1[29] = bf0[29]; |
1521 | 485k | bf1[30] = bf0[30]; |
1522 | 485k | bf1[31] = bf0[31]; |
1523 | 485k | bf1[32] = bf0[32] + bf0[47]; |
1524 | 485k | bf1[33] = bf0[33] + bf0[46]; |
1525 | 485k | bf1[34] = bf0[34] + bf0[45]; |
1526 | 485k | bf1[35] = bf0[35] + bf0[44]; |
1527 | 485k | bf1[36] = bf0[36] + bf0[43]; |
1528 | 485k | bf1[37] = bf0[37] + bf0[42]; |
1529 | 485k | bf1[38] = bf0[38] + bf0[41]; |
1530 | 485k | bf1[39] = bf0[39] + bf0[40]; |
1531 | 485k | bf1[40] = -bf0[40] + bf0[39]; |
1532 | 485k | bf1[41] = -bf0[41] + bf0[38]; |
1533 | 485k | bf1[42] = -bf0[42] + bf0[37]; |
1534 | 485k | bf1[43] = -bf0[43] + bf0[36]; |
1535 | 485k | bf1[44] = -bf0[44] + bf0[35]; |
1536 | 485k | bf1[45] = -bf0[45] + bf0[34]; |
1537 | 485k | bf1[46] = -bf0[46] + bf0[33]; |
1538 | 485k | bf1[47] = -bf0[47] + bf0[32]; |
1539 | 485k | bf1[48] = -bf0[48] + bf0[63]; |
1540 | 485k | bf1[49] = -bf0[49] + bf0[62]; |
1541 | 485k | bf1[50] = -bf0[50] + bf0[61]; |
1542 | 485k | bf1[51] = -bf0[51] + bf0[60]; |
1543 | 485k | bf1[52] = -bf0[52] + bf0[59]; |
1544 | 485k | bf1[53] = -bf0[53] + bf0[58]; |
1545 | 485k | bf1[54] = -bf0[54] + bf0[57]; |
1546 | 485k | bf1[55] = -bf0[55] + bf0[56]; |
1547 | 485k | bf1[56] = bf0[56] + bf0[55]; |
1548 | 485k | bf1[57] = bf0[57] + bf0[54]; |
1549 | 485k | bf1[58] = bf0[58] + bf0[53]; |
1550 | 485k | bf1[59] = bf0[59] + bf0[52]; |
1551 | 485k | bf1[60] = bf0[60] + bf0[51]; |
1552 | 485k | bf1[61] = bf0[61] + bf0[50]; |
1553 | 485k | bf1[62] = bf0[62] + bf0[49]; |
1554 | 485k | bf1[63] = bf0[63] + bf0[48]; |
1555 | | |
1556 | | // stage 4 |
1557 | 485k | cospi = cospi_arr(cos_bit); |
1558 | 485k | bf0 = output; |
1559 | 485k | bf1 = step; |
1560 | 485k | bf1[0] = bf0[0] + bf0[7]; |
1561 | 485k | bf1[1] = bf0[1] + bf0[6]; |
1562 | 485k | bf1[2] = bf0[2] + bf0[5]; |
1563 | 485k | bf1[3] = bf0[3] + bf0[4]; |
1564 | 485k | bf1[4] = -bf0[4] + bf0[3]; |
1565 | 485k | bf1[5] = -bf0[5] + bf0[2]; |
1566 | 485k | bf1[6] = -bf0[6] + bf0[1]; |
1567 | 485k | bf1[7] = -bf0[7] + bf0[0]; |
1568 | 485k | bf1[8] = bf0[8]; |
1569 | 485k | bf1[9] = bf0[9]; |
1570 | 485k | bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); |
1571 | 485k | bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); |
1572 | 485k | bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); |
1573 | 485k | bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); |
1574 | 485k | bf1[14] = bf0[14]; |
1575 | 485k | bf1[15] = bf0[15]; |
1576 | 485k | bf1[16] = bf0[16] + bf0[23]; |
1577 | 485k | bf1[17] = bf0[17] + bf0[22]; |
1578 | 485k | bf1[18] = bf0[18] + bf0[21]; |
1579 | 485k | bf1[19] = bf0[19] + bf0[20]; |
1580 | 485k | bf1[20] = -bf0[20] + bf0[19]; |
1581 | 485k | bf1[21] = -bf0[21] + bf0[18]; |
1582 | 485k | bf1[22] = -bf0[22] + bf0[17]; |
1583 | 485k | bf1[23] = -bf0[23] + bf0[16]; |
1584 | 485k | bf1[24] = -bf0[24] + bf0[31]; |
1585 | 485k | bf1[25] = -bf0[25] + bf0[30]; |
1586 | 485k | bf1[26] = -bf0[26] + bf0[29]; |
1587 | 485k | bf1[27] = -bf0[27] + bf0[28]; |
1588 | 485k | bf1[28] = bf0[28] + bf0[27]; |
1589 | 485k | bf1[29] = bf0[29] + bf0[26]; |
1590 | 485k | bf1[30] = bf0[30] + bf0[25]; |
1591 | 485k | bf1[31] = bf0[31] + bf0[24]; |
1592 | 485k | bf1[32] = bf0[32]; |
1593 | 485k | bf1[33] = bf0[33]; |
1594 | 485k | bf1[34] = bf0[34]; |
1595 | 485k | bf1[35] = bf0[35]; |
1596 | 485k | bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); |
1597 | 485k | bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); |
1598 | 485k | bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); |
1599 | 485k | bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); |
1600 | 485k | bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); |
1601 | 485k | bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); |
1602 | 485k | bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); |
1603 | 485k | bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); |
1604 | 485k | bf1[44] = bf0[44]; |
1605 | 485k | bf1[45] = bf0[45]; |
1606 | 485k | bf1[46] = bf0[46]; |
1607 | 485k | bf1[47] = bf0[47]; |
1608 | 485k | bf1[48] = bf0[48]; |
1609 | 485k | bf1[49] = bf0[49]; |
1610 | 485k | bf1[50] = bf0[50]; |
1611 | 485k | bf1[51] = bf0[51]; |
1612 | 485k | bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit); |
1613 | 485k | bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit); |
1614 | 485k | bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit); |
1615 | 485k | bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit); |
1616 | 485k | bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit); |
1617 | 485k | bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit); |
1618 | 485k | bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit); |
1619 | 485k | bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit); |
1620 | 485k | bf1[60] = bf0[60]; |
1621 | 485k | bf1[61] = bf0[61]; |
1622 | 485k | bf1[62] = bf0[62]; |
1623 | 485k | bf1[63] = bf0[63]; |
1624 | | |
1625 | | // stage 5 |
1626 | 485k | cospi = cospi_arr(cos_bit); |
1627 | 485k | bf0 = step; |
1628 | 485k | bf1 = output; |
1629 | 485k | bf1[0] = bf0[0] + bf0[3]; |
1630 | 485k | bf1[1] = bf0[1] + bf0[2]; |
1631 | 485k | bf1[2] = -bf0[2] + bf0[1]; |
1632 | 485k | bf1[3] = -bf0[3] + bf0[0]; |
1633 | 485k | bf1[4] = bf0[4]; |
1634 | 485k | bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); |
1635 | 485k | bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); |
1636 | 485k | bf1[7] = bf0[7]; |
1637 | 485k | bf1[8] = bf0[8] + bf0[11]; |
1638 | 485k | bf1[9] = bf0[9] + bf0[10]; |
1639 | 485k | bf1[10] = -bf0[10] + bf0[9]; |
1640 | 485k | bf1[11] = -bf0[11] + bf0[8]; |
1641 | 485k | bf1[12] = -bf0[12] + bf0[15]; |
1642 | 485k | bf1[13] = -bf0[13] + bf0[14]; |
1643 | 485k | bf1[14] = bf0[14] + bf0[13]; |
1644 | 485k | bf1[15] = bf0[15] + bf0[12]; |
1645 | 485k | bf1[16] = bf0[16]; |
1646 | 485k | bf1[17] = bf0[17]; |
1647 | 485k | bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); |
1648 | 485k | bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); |
1649 | 485k | bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); |
1650 | 485k | bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); |
1651 | 485k | bf1[22] = bf0[22]; |
1652 | 485k | bf1[23] = bf0[23]; |
1653 | 485k | bf1[24] = bf0[24]; |
1654 | 485k | bf1[25] = bf0[25]; |
1655 | 485k | bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); |
1656 | 485k | bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); |
1657 | 485k | bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); |
1658 | 485k | bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); |
1659 | 485k | bf1[30] = bf0[30]; |
1660 | 485k | bf1[31] = bf0[31]; |
1661 | 485k | bf1[32] = bf0[32] + bf0[39]; |
1662 | 485k | bf1[33] = bf0[33] + bf0[38]; |
1663 | 485k | bf1[34] = bf0[34] + bf0[37]; |
1664 | 485k | bf1[35] = bf0[35] + bf0[36]; |
1665 | 485k | bf1[36] = -bf0[36] + bf0[35]; |
1666 | 485k | bf1[37] = -bf0[37] + bf0[34]; |
1667 | 485k | bf1[38] = -bf0[38] + bf0[33]; |
1668 | 485k | bf1[39] = -bf0[39] + bf0[32]; |
1669 | 485k | bf1[40] = -bf0[40] + bf0[47]; |
1670 | 485k | bf1[41] = -bf0[41] + bf0[46]; |
1671 | 485k | bf1[42] = -bf0[42] + bf0[45]; |
1672 | 485k | bf1[43] = -bf0[43] + bf0[44]; |
1673 | 485k | bf1[44] = bf0[44] + bf0[43]; |
1674 | 485k | bf1[45] = bf0[45] + bf0[42]; |
1675 | 485k | bf1[46] = bf0[46] + bf0[41]; |
1676 | 485k | bf1[47] = bf0[47] + bf0[40]; |
1677 | 485k | bf1[48] = bf0[48] + bf0[55]; |
1678 | 485k | bf1[49] = bf0[49] + bf0[54]; |
1679 | 485k | bf1[50] = bf0[50] + bf0[53]; |
1680 | 485k | bf1[51] = bf0[51] + bf0[52]; |
1681 | 485k | bf1[52] = -bf0[52] + bf0[51]; |
1682 | 485k | bf1[53] = -bf0[53] + bf0[50]; |
1683 | 485k | bf1[54] = -bf0[54] + bf0[49]; |
1684 | 485k | bf1[55] = -bf0[55] + bf0[48]; |
1685 | 485k | bf1[56] = -bf0[56] + bf0[63]; |
1686 | 485k | bf1[57] = -bf0[57] + bf0[62]; |
1687 | 485k | bf1[58] = -bf0[58] + bf0[61]; |
1688 | 485k | bf1[59] = -bf0[59] + bf0[60]; |
1689 | 485k | bf1[60] = bf0[60] + bf0[59]; |
1690 | 485k | bf1[61] = bf0[61] + bf0[58]; |
1691 | 485k | bf1[62] = bf0[62] + bf0[57]; |
1692 | 485k | bf1[63] = bf0[63] + bf0[56]; |
1693 | | |
1694 | | // stage 6 |
1695 | 485k | cospi = cospi_arr(cos_bit); |
1696 | 485k | bf0 = output; |
1697 | 485k | bf1 = step; |
1698 | 485k | bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); |
1699 | 485k | bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); |
1700 | 485k | bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); |
1701 | 485k | bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); |
1702 | 485k | bf1[4] = bf0[4] + bf0[5]; |
1703 | 485k | bf1[5] = -bf0[5] + bf0[4]; |
1704 | 485k | bf1[6] = -bf0[6] + bf0[7]; |
1705 | 485k | bf1[7] = bf0[7] + bf0[6]; |
1706 | 485k | bf1[8] = bf0[8]; |
1707 | 485k | bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); |
1708 | 485k | bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); |
1709 | 485k | bf1[11] = bf0[11]; |
1710 | 485k | bf1[12] = bf0[12]; |
1711 | 485k | bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); |
1712 | 485k | bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); |
1713 | 485k | bf1[15] = bf0[15]; |
1714 | 485k | bf1[16] = bf0[16] + bf0[19]; |
1715 | 485k | bf1[17] = bf0[17] + bf0[18]; |
1716 | 485k | bf1[18] = -bf0[18] + bf0[17]; |
1717 | 485k | bf1[19] = -bf0[19] + bf0[16]; |
1718 | 485k | bf1[20] = -bf0[20] + bf0[23]; |
1719 | 485k | bf1[21] = -bf0[21] + bf0[22]; |
1720 | 485k | bf1[22] = bf0[22] + bf0[21]; |
1721 | 485k | bf1[23] = bf0[23] + bf0[20]; |
1722 | 485k | bf1[24] = bf0[24] + bf0[27]; |
1723 | 485k | bf1[25] = bf0[25] + bf0[26]; |
1724 | 485k | bf1[26] = -bf0[26] + bf0[25]; |
1725 | 485k | bf1[27] = -bf0[27] + bf0[24]; |
1726 | 485k | bf1[28] = -bf0[28] + bf0[31]; |
1727 | 485k | bf1[29] = -bf0[29] + bf0[30]; |
1728 | 485k | bf1[30] = bf0[30] + bf0[29]; |
1729 | 485k | bf1[31] = bf0[31] + bf0[28]; |
1730 | 485k | bf1[32] = bf0[32]; |
1731 | 485k | bf1[33] = bf0[33]; |
1732 | 485k | bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); |
1733 | 485k | bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); |
1734 | 485k | bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); |
1735 | 485k | bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); |
1736 | 485k | bf1[38] = bf0[38]; |
1737 | 485k | bf1[39] = bf0[39]; |
1738 | 485k | bf1[40] = bf0[40]; |
1739 | 485k | bf1[41] = bf0[41]; |
1740 | 485k | bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); |
1741 | 485k | bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); |
1742 | 485k | bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); |
1743 | 485k | bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); |
1744 | 485k | bf1[46] = bf0[46]; |
1745 | 485k | bf1[47] = bf0[47]; |
1746 | 485k | bf1[48] = bf0[48]; |
1747 | 485k | bf1[49] = bf0[49]; |
1748 | 485k | bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit); |
1749 | 485k | bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit); |
1750 | 485k | bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit); |
1751 | 485k | bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit); |
1752 | 485k | bf1[54] = bf0[54]; |
1753 | 485k | bf1[55] = bf0[55]; |
1754 | 485k | bf1[56] = bf0[56]; |
1755 | 485k | bf1[57] = bf0[57]; |
1756 | 485k | bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit); |
1757 | 485k | bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit); |
1758 | 485k | bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit); |
1759 | 485k | bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); |
1760 | 485k | bf1[62] = bf0[62]; |
1761 | 485k | bf1[63] = bf0[63]; |
1762 | | |
1763 | | // stage 7 |
1764 | 485k | cospi = cospi_arr(cos_bit); |
1765 | 485k | bf0 = step; |
1766 | 485k | bf1 = output; |
1767 | 485k | bf1[0] = bf0[0]; |
1768 | 485k | bf1[1] = bf0[1]; |
1769 | 485k | bf1[2] = bf0[2]; |
1770 | 485k | bf1[3] = bf0[3]; |
1771 | 485k | bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); |
1772 | 485k | bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); |
1773 | 485k | bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); |
1774 | 485k | bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); |
1775 | 485k | bf1[8] = bf0[8] + bf0[9]; |
1776 | 485k | bf1[9] = -bf0[9] + bf0[8]; |
1777 | 485k | bf1[10] = -bf0[10] + bf0[11]; |
1778 | 485k | bf1[11] = bf0[11] + bf0[10]; |
1779 | 485k | bf1[12] = bf0[12] + bf0[13]; |
1780 | 485k | bf1[13] = -bf0[13] + bf0[12]; |
1781 | 485k | bf1[14] = -bf0[14] + bf0[15]; |
1782 | 485k | bf1[15] = bf0[15] + bf0[14]; |
1783 | 485k | bf1[16] = bf0[16]; |
1784 | 485k | bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); |
1785 | 485k | bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); |
1786 | 485k | bf1[19] = bf0[19]; |
1787 | 485k | bf1[20] = bf0[20]; |
1788 | 485k | bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); |
1789 | 485k | bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); |
1790 | 485k | bf1[23] = bf0[23]; |
1791 | 485k | bf1[24] = bf0[24]; |
1792 | 485k | bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); |
1793 | 485k | bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); |
1794 | 485k | bf1[27] = bf0[27]; |
1795 | 485k | bf1[28] = bf0[28]; |
1796 | 485k | bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); |
1797 | 485k | bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); |
1798 | 485k | bf1[31] = bf0[31]; |
1799 | 485k | bf1[32] = bf0[32] + bf0[35]; |
1800 | 485k | bf1[33] = bf0[33] + bf0[34]; |
1801 | 485k | bf1[34] = -bf0[34] + bf0[33]; |
1802 | 485k | bf1[35] = -bf0[35] + bf0[32]; |
1803 | 485k | bf1[36] = -bf0[36] + bf0[39]; |
1804 | 485k | bf1[37] = -bf0[37] + bf0[38]; |
1805 | 485k | bf1[38] = bf0[38] + bf0[37]; |
1806 | 485k | bf1[39] = bf0[39] + bf0[36]; |
1807 | 485k | bf1[40] = bf0[40] + bf0[43]; |
1808 | 485k | bf1[41] = bf0[41] + bf0[42]; |
1809 | 485k | bf1[42] = -bf0[42] + bf0[41]; |
1810 | 485k | bf1[43] = -bf0[43] + bf0[40]; |
1811 | 485k | bf1[44] = -bf0[44] + bf0[47]; |
1812 | 485k | bf1[45] = -bf0[45] + bf0[46]; |
1813 | 485k | bf1[46] = bf0[46] + bf0[45]; |
1814 | 485k | bf1[47] = bf0[47] + bf0[44]; |
1815 | 485k | bf1[48] = bf0[48] + bf0[51]; |
1816 | 485k | bf1[49] = bf0[49] + bf0[50]; |
1817 | 485k | bf1[50] = -bf0[50] + bf0[49]; |
1818 | 485k | bf1[51] = -bf0[51] + bf0[48]; |
1819 | 485k | bf1[52] = -bf0[52] + bf0[55]; |
1820 | 485k | bf1[53] = -bf0[53] + bf0[54]; |
1821 | 485k | bf1[54] = bf0[54] + bf0[53]; |
1822 | 485k | bf1[55] = bf0[55] + bf0[52]; |
1823 | 485k | bf1[56] = bf0[56] + bf0[59]; |
1824 | 485k | bf1[57] = bf0[57] + bf0[58]; |
1825 | 485k | bf1[58] = -bf0[58] + bf0[57]; |
1826 | 485k | bf1[59] = -bf0[59] + bf0[56]; |
1827 | 485k | bf1[60] = -bf0[60] + bf0[63]; |
1828 | 485k | bf1[61] = -bf0[61] + bf0[62]; |
1829 | 485k | bf1[62] = bf0[62] + bf0[61]; |
1830 | 485k | bf1[63] = bf0[63] + bf0[60]; |
1831 | | |
1832 | | // stage 8 |
1833 | 485k | cospi = cospi_arr(cos_bit); |
1834 | 485k | bf0 = output; |
1835 | 485k | bf1 = step; |
1836 | 485k | bf1[0] = bf0[0]; |
1837 | 485k | bf1[1] = bf0[1]; |
1838 | 485k | bf1[2] = bf0[2]; |
1839 | 485k | bf1[3] = bf0[3]; |
1840 | 485k | bf1[4] = bf0[4]; |
1841 | 485k | bf1[5] = bf0[5]; |
1842 | 485k | bf1[6] = bf0[6]; |
1843 | 485k | bf1[7] = bf0[7]; |
1844 | 485k | bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); |
1845 | 485k | bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); |
1846 | 485k | bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); |
1847 | 485k | bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); |
1848 | 485k | bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); |
1849 | 485k | bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); |
1850 | 485k | bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); |
1851 | 485k | bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); |
1852 | 485k | bf1[16] = bf0[16] + bf0[17]; |
1853 | 485k | bf1[17] = -bf0[17] + bf0[16]; |
1854 | 485k | bf1[18] = -bf0[18] + bf0[19]; |
1855 | 485k | bf1[19] = bf0[19] + bf0[18]; |
1856 | 485k | bf1[20] = bf0[20] + bf0[21]; |
1857 | 485k | bf1[21] = -bf0[21] + bf0[20]; |
1858 | 485k | bf1[22] = -bf0[22] + bf0[23]; |
1859 | 485k | bf1[23] = bf0[23] + bf0[22]; |
1860 | 485k | bf1[24] = bf0[24] + bf0[25]; |
1861 | 485k | bf1[25] = -bf0[25] + bf0[24]; |
1862 | 485k | bf1[26] = -bf0[26] + bf0[27]; |
1863 | 485k | bf1[27] = bf0[27] + bf0[26]; |
1864 | 485k | bf1[28] = bf0[28] + bf0[29]; |
1865 | 485k | bf1[29] = -bf0[29] + bf0[28]; |
1866 | 485k | bf1[30] = -bf0[30] + bf0[31]; |
1867 | 485k | bf1[31] = bf0[31] + bf0[30]; |
1868 | 485k | bf1[32] = bf0[32]; |
1869 | 485k | bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); |
1870 | 485k | bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); |
1871 | 485k | bf1[35] = bf0[35]; |
1872 | 485k | bf1[36] = bf0[36]; |
1873 | 485k | bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); |
1874 | 485k | bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); |
1875 | 485k | bf1[39] = bf0[39]; |
1876 | 485k | bf1[40] = bf0[40]; |
1877 | 485k | bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); |
1878 | 485k | bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); |
1879 | 485k | bf1[43] = bf0[43]; |
1880 | 485k | bf1[44] = bf0[44]; |
1881 | 485k | bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); |
1882 | 485k | bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); |
1883 | 485k | bf1[47] = bf0[47]; |
1884 | 485k | bf1[48] = bf0[48]; |
1885 | 485k | bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit); |
1886 | 485k | bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit); |
1887 | 485k | bf1[51] = bf0[51]; |
1888 | 485k | bf1[52] = bf0[52]; |
1889 | 485k | bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit); |
1890 | 485k | bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit); |
1891 | 485k | bf1[55] = bf0[55]; |
1892 | 485k | bf1[56] = bf0[56]; |
1893 | 485k | bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit); |
1894 | 485k | bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit); |
1895 | 485k | bf1[59] = bf0[59]; |
1896 | 485k | bf1[60] = bf0[60]; |
1897 | 485k | bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); |
1898 | 485k | bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); |
1899 | 485k | bf1[63] = bf0[63]; |
1900 | | |
1901 | | // stage 9 |
1902 | 485k | cospi = cospi_arr(cos_bit); |
1903 | 485k | bf0 = step; |
1904 | 485k | bf1 = output; |
1905 | 485k | bf1[0] = bf0[0]; |
1906 | 485k | bf1[1] = bf0[1]; |
1907 | 485k | bf1[2] = bf0[2]; |
1908 | 485k | bf1[3] = bf0[3]; |
1909 | 485k | bf1[4] = bf0[4]; |
1910 | 485k | bf1[5] = bf0[5]; |
1911 | 485k | bf1[6] = bf0[6]; |
1912 | 485k | bf1[7] = bf0[7]; |
1913 | 485k | bf1[8] = bf0[8]; |
1914 | 485k | bf1[9] = bf0[9]; |
1915 | 485k | bf1[10] = bf0[10]; |
1916 | 485k | bf1[11] = bf0[11]; |
1917 | 485k | bf1[12] = bf0[12]; |
1918 | 485k | bf1[13] = bf0[13]; |
1919 | 485k | bf1[14] = bf0[14]; |
1920 | 485k | bf1[15] = bf0[15]; |
1921 | 485k | bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); |
1922 | 485k | bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); |
1923 | 485k | bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); |
1924 | 485k | bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); |
1925 | 485k | bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); |
1926 | 485k | bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); |
1927 | 485k | bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); |
1928 | 485k | bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); |
1929 | 485k | bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); |
1930 | 485k | bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); |
1931 | 485k | bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); |
1932 | 485k | bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); |
1933 | 485k | bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); |
1934 | 485k | bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); |
1935 | 485k | bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); |
1936 | 485k | bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); |
1937 | 485k | bf1[32] = bf0[32] + bf0[33]; |
1938 | 485k | bf1[33] = -bf0[33] + bf0[32]; |
1939 | 485k | bf1[34] = -bf0[34] + bf0[35]; |
1940 | 485k | bf1[35] = bf0[35] + bf0[34]; |
1941 | 485k | bf1[36] = bf0[36] + bf0[37]; |
1942 | 485k | bf1[37] = -bf0[37] + bf0[36]; |
1943 | 485k | bf1[38] = -bf0[38] + bf0[39]; |
1944 | 485k | bf1[39] = bf0[39] + bf0[38]; |
1945 | 485k | bf1[40] = bf0[40] + bf0[41]; |
1946 | 485k | bf1[41] = -bf0[41] + bf0[40]; |
1947 | 485k | bf1[42] = -bf0[42] + bf0[43]; |
1948 | 485k | bf1[43] = bf0[43] + bf0[42]; |
1949 | 485k | bf1[44] = bf0[44] + bf0[45]; |
1950 | 485k | bf1[45] = -bf0[45] + bf0[44]; |
1951 | 485k | bf1[46] = -bf0[46] + bf0[47]; |
1952 | 485k | bf1[47] = bf0[47] + bf0[46]; |
1953 | 485k | bf1[48] = bf0[48] + bf0[49]; |
1954 | 485k | bf1[49] = -bf0[49] + bf0[48]; |
1955 | 485k | bf1[50] = -bf0[50] + bf0[51]; |
1956 | 485k | bf1[51] = bf0[51] + bf0[50]; |
1957 | 485k | bf1[52] = bf0[52] + bf0[53]; |
1958 | 485k | bf1[53] = -bf0[53] + bf0[52]; |
1959 | 485k | bf1[54] = -bf0[54] + bf0[55]; |
1960 | 485k | bf1[55] = bf0[55] + bf0[54]; |
1961 | 485k | bf1[56] = bf0[56] + bf0[57]; |
1962 | 485k | bf1[57] = -bf0[57] + bf0[56]; |
1963 | 485k | bf1[58] = -bf0[58] + bf0[59]; |
1964 | 485k | bf1[59] = bf0[59] + bf0[58]; |
1965 | 485k | bf1[60] = bf0[60] + bf0[61]; |
1966 | 485k | bf1[61] = -bf0[61] + bf0[60]; |
1967 | 485k | bf1[62] = -bf0[62] + bf0[63]; |
1968 | 485k | bf1[63] = bf0[63] + bf0[62]; |
1969 | | |
1970 | | // stage 10 |
1971 | 485k | cospi = cospi_arr(cos_bit); |
1972 | 485k | bf0 = output; |
1973 | 485k | bf1 = step; |
1974 | 485k | bf1[0] = bf0[0]; |
1975 | 485k | bf1[1] = bf0[1]; |
1976 | 485k | bf1[2] = bf0[2]; |
1977 | 485k | bf1[3] = bf0[3]; |
1978 | 485k | bf1[4] = bf0[4]; |
1979 | 485k | bf1[5] = bf0[5]; |
1980 | 485k | bf1[6] = bf0[6]; |
1981 | 485k | bf1[7] = bf0[7]; |
1982 | 485k | bf1[8] = bf0[8]; |
1983 | 485k | bf1[9] = bf0[9]; |
1984 | 485k | bf1[10] = bf0[10]; |
1985 | 485k | bf1[11] = bf0[11]; |
1986 | 485k | bf1[12] = bf0[12]; |
1987 | 485k | bf1[13] = bf0[13]; |
1988 | 485k | bf1[14] = bf0[14]; |
1989 | 485k | bf1[15] = bf0[15]; |
1990 | 485k | bf1[16] = bf0[16]; |
1991 | 485k | bf1[17] = bf0[17]; |
1992 | 485k | bf1[18] = bf0[18]; |
1993 | 485k | bf1[19] = bf0[19]; |
1994 | 485k | bf1[20] = bf0[20]; |
1995 | 485k | bf1[21] = bf0[21]; |
1996 | 485k | bf1[22] = bf0[22]; |
1997 | 485k | bf1[23] = bf0[23]; |
1998 | 485k | bf1[24] = bf0[24]; |
1999 | 485k | bf1[25] = bf0[25]; |
2000 | 485k | bf1[26] = bf0[26]; |
2001 | 485k | bf1[27] = bf0[27]; |
2002 | 485k | bf1[28] = bf0[28]; |
2003 | 485k | bf1[29] = bf0[29]; |
2004 | 485k | bf1[30] = bf0[30]; |
2005 | 485k | bf1[31] = bf0[31]; |
2006 | 485k | bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit); |
2007 | 485k | bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit); |
2008 | 485k | bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit); |
2009 | 485k | bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit); |
2010 | 485k | bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit); |
2011 | 485k | bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit); |
2012 | 485k | bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit); |
2013 | 485k | bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit); |
2014 | 485k | bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit); |
2015 | 485k | bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit); |
2016 | 485k | bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit); |
2017 | 485k | bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit); |
2018 | 485k | bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit); |
2019 | 485k | bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit); |
2020 | 485k | bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit); |
2021 | 485k | bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit); |
2022 | 485k | bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit); |
2023 | 485k | bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit); |
2024 | 485k | bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit); |
2025 | 485k | bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit); |
2026 | 485k | bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit); |
2027 | 485k | bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit); |
2028 | 485k | bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit); |
2029 | 485k | bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit); |
2030 | 485k | bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit); |
2031 | 485k | bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit); |
2032 | 485k | bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit); |
2033 | 485k | bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit); |
2034 | 485k | bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit); |
2035 | 485k | bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit); |
2036 | 485k | bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit); |
2037 | 485k | bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit); |
2038 | | |
2039 | | // stage 11 |
2040 | 485k | bf0 = step; |
2041 | 485k | bf1 = output; |
2042 | 485k | bf1[0] = bf0[0]; |
2043 | 485k | bf1[1] = bf0[32]; |
2044 | 485k | bf1[2] = bf0[16]; |
2045 | 485k | bf1[3] = bf0[48]; |
2046 | 485k | bf1[4] = bf0[8]; |
2047 | 485k | bf1[5] = bf0[40]; |
2048 | 485k | bf1[6] = bf0[24]; |
2049 | 485k | bf1[7] = bf0[56]; |
2050 | 485k | bf1[8] = bf0[4]; |
2051 | 485k | bf1[9] = bf0[36]; |
2052 | 485k | bf1[10] = bf0[20]; |
2053 | 485k | bf1[11] = bf0[52]; |
2054 | 485k | bf1[12] = bf0[12]; |
2055 | 485k | bf1[13] = bf0[44]; |
2056 | 485k | bf1[14] = bf0[28]; |
2057 | 485k | bf1[15] = bf0[60]; |
2058 | 485k | bf1[16] = bf0[2]; |
2059 | 485k | bf1[17] = bf0[34]; |
2060 | 485k | bf1[18] = bf0[18]; |
2061 | 485k | bf1[19] = bf0[50]; |
2062 | 485k | bf1[20] = bf0[10]; |
2063 | 485k | bf1[21] = bf0[42]; |
2064 | 485k | bf1[22] = bf0[26]; |
2065 | 485k | bf1[23] = bf0[58]; |
2066 | 485k | bf1[24] = bf0[6]; |
2067 | 485k | bf1[25] = bf0[38]; |
2068 | 485k | bf1[26] = bf0[22]; |
2069 | 485k | bf1[27] = bf0[54]; |
2070 | 485k | bf1[28] = bf0[14]; |
2071 | 485k | bf1[29] = bf0[46]; |
2072 | 485k | bf1[30] = bf0[30]; |
2073 | 485k | bf1[31] = bf0[62]; |
2074 | 485k | bf1[32] = bf0[1]; |
2075 | 485k | bf1[33] = bf0[33]; |
2076 | 485k | bf1[34] = bf0[17]; |
2077 | 485k | bf1[35] = bf0[49]; |
2078 | 485k | bf1[36] = bf0[9]; |
2079 | 485k | bf1[37] = bf0[41]; |
2080 | 485k | bf1[38] = bf0[25]; |
2081 | 485k | bf1[39] = bf0[57]; |
2082 | 485k | bf1[40] = bf0[5]; |
2083 | 485k | bf1[41] = bf0[37]; |
2084 | 485k | bf1[42] = bf0[21]; |
2085 | 485k | bf1[43] = bf0[53]; |
2086 | 485k | bf1[44] = bf0[13]; |
2087 | 485k | bf1[45] = bf0[45]; |
2088 | 485k | bf1[46] = bf0[29]; |
2089 | 485k | bf1[47] = bf0[61]; |
2090 | 485k | bf1[48] = bf0[3]; |
2091 | 485k | bf1[49] = bf0[35]; |
2092 | 485k | bf1[50] = bf0[19]; |
2093 | 485k | bf1[51] = bf0[51]; |
2094 | 485k | bf1[52] = bf0[11]; |
2095 | 485k | bf1[53] = bf0[43]; |
2096 | 485k | bf1[54] = bf0[27]; |
2097 | 485k | bf1[55] = bf0[59]; |
2098 | 485k | bf1[56] = bf0[7]; |
2099 | 485k | bf1[57] = bf0[39]; |
2100 | 485k | bf1[58] = bf0[23]; |
2101 | 485k | bf1[59] = bf0[55]; |
2102 | 485k | bf1[60] = bf0[15]; |
2103 | 485k | bf1[61] = bf0[47]; |
2104 | 485k | bf1[62] = bf0[31]; |
2105 | 485k | bf1[63] = bf0[63]; |
2106 | 485k | } |
2107 | | |
/*
 * 4-point forward transform (presumably the forward ADST, per the function
 * name -- confirm against the transform dispatch table).
 *
 * input       : four int32 samples; all are read before output is written.
 * output      : receives four int32 coefficients.
 * cos_bit     : selects the table returned by sinpi_arr() and is also the
 *               shift passed to round_shift() for the final results.
 * stage_range : unused; no range checking is done on the intermediates.
 *
 * All intermediate arithmetic is carried out in int32_t.
 */
void svt_av1_fadst4_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    const int32_t  shift   = cos_bit;
    const int32_t* sin_tbl = sinpi_arr(shift);

    const int32_t in0 = input[0];
    const int32_t in1 = input[1];
    const int32_t in2 = input[2];
    const int32_t in3 = input[3];

    // An all-zero input produces an all-zero output; skip the arithmetic.
    if ((in0 | in1 | in2 | in3) == 0) {
        output[0] = 0;
        output[1] = 0;
        output[2] = 0;
        output[3] = 0;
        return;
    }

    // Table-weighted products of the individual inputs.
    const int32_t p1_in0 = sin_tbl[1] * in0;
    const int32_t p4_in0 = sin_tbl[4] * in0;
    const int32_t p2_in1 = sin_tbl[2] * in1;
    const int32_t p1_in1 = sin_tbl[1] * in1;
    const int32_t p3_in2 = sin_tbl[3] * in2;
    const int32_t p4_in3 = sin_tbl[4] * in3;
    const int32_t p2_in3 = sin_tbl[2] * in3;

    // Combination of the raw inputs that feeds output[1].
    const int32_t mix = in0 + in1 - in3;

    // Partial sums shared by several outputs.
    const int32_t acc_a = p1_in0 + p2_in1 + p4_in3;
    const int32_t acc_b = p4_in0 - p1_in1 + p2_in3;

    // 1-D transform scaling factor is sqrt(2).
    output[0] = round_shift(acc_a + p3_in2, shift);
    output[1] = round_shift(sin_tbl[3] * mix, shift);
    output[2] = round_shift(acc_b - p3_in2, shift);
    output[3] = round_shift(acc_b - acc_a + p3_in2, shift);
}
2196 | | |
/*
 * 8-point forward transform (presumably the forward ADST, per the function
 * name -- confirm against the transform dispatch table).
 *
 * The computation runs in seven stages that alternate between the caller's
 * output buffer and a local scratch array; the final stage leaves the
 * result in output.
 *
 * input       : eight int32 samples; must not be the same pointer as output
 *               (asserted).
 * output      : receives eight int32 coefficients; also used as working
 *               storage for the odd-numbered stages.
 * cos_bit     : selects the table returned by cospi_arr() and is forwarded
 *               to every half_btf() call.
 * stage_range : unused.
 */
void svt_av1_fadst8_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t scratch[8];

    // stage 1: permute the input into output, negating selected entries.
    assert(output != input);
    output[0] = input[0];
    output[1] = -input[7];
    output[2] = -input[3];
    output[3] = input[4];
    output[4] = -input[1];
    output[5] = input[6];
    output[6] = input[2];
    output[7] = -input[5];

    const int32_t* const cospi = cospi_arr(cos_bit);

    // stage 2: in each group of four, pass the first pair through and
    // rotate the second pair with cospi[32].
    for (int base = 0; base < 8; base += 4) {
        scratch[base]     = output[base];
        scratch[base + 1] = output[base + 1];
        scratch[base + 2] = half_btf(cospi[32], output[base + 2], cospi[32], output[base + 3], cos_bit);
        scratch[base + 3] = half_btf(cospi[32], output[base + 2], -cospi[32], output[base + 3], cos_bit);
    }

    // stage 3: sum/difference butterflies at distance 2 inside each group of four.
    for (int base = 0; base < 8; base += 4) {
        output[base]     = scratch[base] + scratch[base + 2];
        output[base + 1] = scratch[base + 1] + scratch[base + 3];
        output[base + 2] = scratch[base] - scratch[base + 2];
        output[base + 3] = scratch[base + 1] - scratch[base + 3];
    }

    // stage 4: lower half passes through, upper half is rotated with cospi[16]/cospi[48].
    for (int k = 0; k < 4; ++k) {
        scratch[k] = output[k];
    }
    scratch[4] = half_btf(cospi[16], output[4], cospi[48], output[5], cos_bit);
    scratch[5] = half_btf(cospi[48], output[4], -cospi[16], output[5], cos_bit);
    scratch[6] = half_btf(-cospi[48], output[6], cospi[16], output[7], cos_bit);
    scratch[7] = half_btf(cospi[16], output[6], cospi[48], output[7], cos_bit);

    // stage 5: sum/difference butterflies at distance 4.
    for (int k = 0; k < 4; ++k) {
        output[k]     = scratch[k] + scratch[k + 4];
        output[k + 4] = scratch[k] - scratch[k + 4];
    }

    // stage 6: rotate each adjacent pair; the table indices are
    // (4, 60), (20, 44), (36, 28), (52, 12) for pairs 0..3.
    for (int pair = 0; pair < 4; ++pair) {
        const int     lo_idx = 4 + 16 * pair;
        const int     hi_idx = 64 - lo_idx;
        const int32_t even   = output[2 * pair];
        const int32_t odd    = output[2 * pair + 1];

        scratch[2 * pair]     = half_btf(cospi[lo_idx], even, cospi[hi_idx], odd, cos_bit);
        scratch[2 * pair + 1] = half_btf(cospi[hi_idx], even, -cospi[lo_idx], odd, cos_bit);
    }

    // stage 7: final reordering into output.
    output[0] = scratch[1];
    output[1] = scratch[6];
    output[2] = scratch[3];
    output[3] = scratch[4];
    output[4] = scratch[5];
    output[5] = scratch[2];
    output[6] = scratch[7];
    output[7] = scratch[0];
}
2293 | | |
2294 | 0 | void svt_av1_fadst16_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
2295 | 0 | (void)stage_range; |
2296 | 0 | const int32_t* cospi; |
2297 | |
|
2298 | 0 | int32_t *bf0, *bf1; |
2299 | 0 | int32_t step[16]; |
2300 | | |
2301 | | // stage 0; |
2302 | | |
2303 | | // stage 1; |
2304 | 0 | assert(output != input); |
2305 | 0 | bf1 = output; |
2306 | 0 | bf1[0] = input[0]; |
2307 | 0 | bf1[1] = -input[15]; |
2308 | 0 | bf1[2] = -input[7]; |
2309 | 0 | bf1[3] = input[8]; |
2310 | 0 | bf1[4] = -input[3]; |
2311 | 0 | bf1[5] = input[12]; |
2312 | 0 | bf1[6] = input[4]; |
2313 | 0 | bf1[7] = -input[11]; |
2314 | 0 | bf1[8] = -input[1]; |
2315 | 0 | bf1[9] = input[14]; |
2316 | 0 | bf1[10] = input[6]; |
2317 | 0 | bf1[11] = -input[9]; |
2318 | 0 | bf1[12] = input[2]; |
2319 | 0 | bf1[13] = -input[13]; |
2320 | 0 | bf1[14] = -input[5]; |
2321 | 0 | bf1[15] = input[10]; |
2322 | | |
2323 | | // stage 2 |
2324 | 0 | cospi = cospi_arr(cos_bit); |
2325 | 0 | bf0 = output; |
2326 | 0 | bf1 = step; |
2327 | 0 | bf1[0] = bf0[0]; |
2328 | 0 | bf1[1] = bf0[1]; |
2329 | 0 | bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); |
2330 | 0 | bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); |
2331 | 0 | bf1[4] = bf0[4]; |
2332 | 0 | bf1[5] = bf0[5]; |
2333 | 0 | bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); |
2334 | 0 | bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); |
2335 | 0 | bf1[8] = bf0[8]; |
2336 | 0 | bf1[9] = bf0[9]; |
2337 | 0 | bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); |
2338 | 0 | bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); |
2339 | 0 | bf1[12] = bf0[12]; |
2340 | 0 | bf1[13] = bf0[13]; |
2341 | 0 | bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); |
2342 | 0 | bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); |
2343 | | |
2344 | | // stage 3 |
2345 | 0 | bf0 = step; |
2346 | 0 | bf1 = output; |
2347 | 0 | bf1[0] = bf0[0] + bf0[2]; |
2348 | 0 | bf1[1] = bf0[1] + bf0[3]; |
2349 | 0 | bf1[2] = bf0[0] - bf0[2]; |
2350 | 0 | bf1[3] = bf0[1] - bf0[3]; |
2351 | 0 | bf1[4] = bf0[4] + bf0[6]; |
2352 | 0 | bf1[5] = bf0[5] + bf0[7]; |
2353 | 0 | bf1[6] = bf0[4] - bf0[6]; |
2354 | 0 | bf1[7] = bf0[5] - bf0[7]; |
2355 | 0 | bf1[8] = bf0[8] + bf0[10]; |
2356 | 0 | bf1[9] = bf0[9] + bf0[11]; |
2357 | 0 | bf1[10] = bf0[8] - bf0[10]; |
2358 | 0 | bf1[11] = bf0[9] - bf0[11]; |
2359 | 0 | bf1[12] = bf0[12] + bf0[14]; |
2360 | 0 | bf1[13] = bf0[13] + bf0[15]; |
2361 | 0 | bf1[14] = bf0[12] - bf0[14]; |
2362 | 0 | bf1[15] = bf0[13] - bf0[15]; |
2363 | | |
2364 | | // stage 4 |
2365 | 0 | cospi = cospi_arr(cos_bit); |
2366 | 0 | bf0 = output; |
2367 | 0 | bf1 = step; |
2368 | 0 | bf1[0] = bf0[0]; |
2369 | 0 | bf1[1] = bf0[1]; |
2370 | 0 | bf1[2] = bf0[2]; |
2371 | 0 | bf1[3] = bf0[3]; |
2372 | 0 | bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); |
2373 | 0 | bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); |
2374 | 0 | bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); |
2375 | 0 | bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); |
2376 | 0 | bf1[8] = bf0[8]; |
2377 | 0 | bf1[9] = bf0[9]; |
2378 | 0 | bf1[10] = bf0[10]; |
2379 | 0 | bf1[11] = bf0[11]; |
2380 | 0 | bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); |
2381 | 0 | bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); |
2382 | 0 | bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); |
2383 | 0 | bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); |
2384 | | |
2385 | | // stage 5 |
2386 | 0 | bf0 = step; |
2387 | 0 | bf1 = output; |
2388 | 0 | bf1[0] = bf0[0] + bf0[4]; |
2389 | 0 | bf1[1] = bf0[1] + bf0[5]; |
2390 | 0 | bf1[2] = bf0[2] + bf0[6]; |
2391 | 0 | bf1[3] = bf0[3] + bf0[7]; |
2392 | 0 | bf1[4] = bf0[0] - bf0[4]; |
2393 | 0 | bf1[5] = bf0[1] - bf0[5]; |
2394 | 0 | bf1[6] = bf0[2] - bf0[6]; |
2395 | 0 | bf1[7] = bf0[3] - bf0[7]; |
2396 | 0 | bf1[8] = bf0[8] + bf0[12]; |
2397 | 0 | bf1[9] = bf0[9] + bf0[13]; |
2398 | 0 | bf1[10] = bf0[10] + bf0[14]; |
2399 | 0 | bf1[11] = bf0[11] + bf0[15]; |
2400 | 0 | bf1[12] = bf0[8] - bf0[12]; |
2401 | 0 | bf1[13] = bf0[9] - bf0[13]; |
2402 | 0 | bf1[14] = bf0[10] - bf0[14]; |
2403 | 0 | bf1[15] = bf0[11] - bf0[15]; |
2404 | | |
2405 | | // stage 6 |
2406 | 0 | cospi = cospi_arr(cos_bit); |
2407 | 0 | bf0 = output; |
2408 | 0 | bf1 = step; |
2409 | 0 | bf1[0] = bf0[0]; |
2410 | 0 | bf1[1] = bf0[1]; |
2411 | 0 | bf1[2] = bf0[2]; |
2412 | 0 | bf1[3] = bf0[3]; |
2413 | 0 | bf1[4] = bf0[4]; |
2414 | 0 | bf1[5] = bf0[5]; |
2415 | 0 | bf1[6] = bf0[6]; |
2416 | 0 | bf1[7] = bf0[7]; |
2417 | 0 | bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); |
2418 | 0 | bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); |
2419 | 0 | bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); |
2420 | 0 | bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); |
2421 | 0 | bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); |
2422 | 0 | bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); |
2423 | 0 | bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); |
2424 | 0 | bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); |
2425 | | |
2426 | | // stage 7 |
2427 | 0 | bf0 = step; |
2428 | 0 | bf1 = output; |
2429 | 0 | bf1[0] = bf0[0] + bf0[8]; |
2430 | 0 | bf1[1] = bf0[1] + bf0[9]; |
2431 | 0 | bf1[2] = bf0[2] + bf0[10]; |
2432 | 0 | bf1[3] = bf0[3] + bf0[11]; |
2433 | 0 | bf1[4] = bf0[4] + bf0[12]; |
2434 | 0 | bf1[5] = bf0[5] + bf0[13]; |
2435 | 0 | bf1[6] = bf0[6] + bf0[14]; |
2436 | 0 | bf1[7] = bf0[7] + bf0[15]; |
2437 | 0 | bf1[8] = bf0[0] - bf0[8]; |
2438 | 0 | bf1[9] = bf0[1] - bf0[9]; |
2439 | 0 | bf1[10] = bf0[2] - bf0[10]; |
2440 | 0 | bf1[11] = bf0[3] - bf0[11]; |
2441 | 0 | bf1[12] = bf0[4] - bf0[12]; |
2442 | 0 | bf1[13] = bf0[5] - bf0[13]; |
2443 | 0 | bf1[14] = bf0[6] - bf0[14]; |
2444 | 0 | bf1[15] = bf0[7] - bf0[15]; |
2445 | | |
2446 | | // stage 8 |
2447 | 0 | cospi = cospi_arr(cos_bit); |
2448 | 0 | bf0 = output; |
2449 | 0 | bf1 = step; |
2450 | 0 | bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); |
2451 | 0 | bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); |
2452 | 0 | bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); |
2453 | 0 | bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); |
2454 | 0 | bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); |
2455 | 0 | bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); |
2456 | 0 | bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); |
2457 | 0 | bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); |
2458 | 0 | bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); |
2459 | 0 | bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit); |
2460 | 0 | bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); |
2461 | 0 | bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); |
2462 | 0 | bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); |
2463 | 0 | bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); |
2464 | 0 | bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); |
2465 | 0 | bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); |
2466 | | |
2467 | | // stage 9 |
2468 | 0 | bf0 = step; |
2469 | 0 | bf1 = output; |
2470 | 0 | bf1[0] = bf0[1]; |
2471 | 0 | bf1[1] = bf0[14]; |
2472 | 0 | bf1[2] = bf0[3]; |
2473 | 0 | bf1[3] = bf0[12]; |
2474 | 0 | bf1[4] = bf0[5]; |
2475 | 0 | bf1[5] = bf0[10]; |
2476 | 0 | bf1[6] = bf0[7]; |
2477 | 0 | bf1[7] = bf0[8]; |
2478 | 0 | bf1[8] = bf0[9]; |
2479 | 0 | bf1[9] = bf0[6]; |
2480 | 0 | bf1[10] = bf0[11]; |
2481 | 0 | bf1[11] = bf0[4]; |
2482 | 0 | bf1[12] = bf0[13]; |
2483 | 0 | bf1[13] = bf0[2]; |
2484 | 0 | bf1[14] = bf0[15]; |
2485 | 0 | bf1[15] = bf0[0]; |
2486 | 0 | } |
2487 | | |
/*
 * Rotation of one adjacent pair, "positive-first" form:
 *   dst[0] = half_btf( wa, src[0], wb, src[1])
 *   dst[1] = half_btf(-wa, src[1], wb, src[0])
 */
static void fadst32_rot_a(const int32_t* src, int32_t* dst, int32_t wa, int32_t wb, int8_t cos_bit) {
    dst[0] = half_btf(wa, src[0], wb, src[1], cos_bit);
    dst[1] = half_btf(-wa, src[1], wb, src[0], cos_bit);
}

/*
 * Rotation of one adjacent pair, "negative-first" form:
 *   dst[0] = half_btf(-wa, src[0], wb, src[1])
 *   dst[1] = half_btf( wa, src[1], wb, src[0])
 */
static void fadst32_rot_b(const int32_t* src, int32_t* dst, int32_t wa, int32_t wb, int8_t cos_bit) {
    dst[0] = half_btf(-wa, src[0], wb, src[1], cos_bit);
    dst[1] = half_btf(wa, src[1], wb, src[0], cos_bit);
}

/*
 * Add/subtract butterfly over 32 values, applied independently to each
 * group of 2*span entries: the lower half of a group receives lo + hi,
 * the upper half receives -hi + lo. src and dst must be different buffers.
 */
static void fadst32_add_sub(const int32_t* src, int32_t* dst, int32_t span) {
    for (int32_t base = 0; base < 32; base += 2 * span) {
        for (int32_t k = 0; k < span; ++k) {
            const int32_t lo = base + k;
            const int32_t hi = lo + span;
            dst[lo]          = src[lo] + src[hi];
            dst[hi]          = -src[hi] + src[lo];
        }
    }
}

/*
 * 32-point forward ADST kernel.
 *
 * input       : 32 source values (read only in stage 1).
 * output      : 32 results; also used as a ping-pong buffer together with a
 *               local 32-entry scratch array, so it is overwritten several
 *               times before the final values are stored.
 * cos_bit     : selects the cosine table via cospi_arr() and is forwarded to
 *               every half_btf() call.
 * stage_range : unused.
 */
static void av1_fadst32_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    /* Source index in the scratch buffer for every output slot of stage 11;
     * odd output slots are additionally negated. */
    static const uint8_t final_order[32] = {0, 16, 24, 8,  12, 28, 20, 4, 6, 22, 30, 14, 10, 26, 18, 2,
                                            3, 19, 27, 11, 15, 31, 23, 7, 5, 21, 29, 13, 9,  25, 17, 1};

    (void)stage_range;
    /* The table depends only on cos_bit, so it is looked up once for all stages. */
    const int32_t* const cospi = cospi_arr(cos_bit);
    int32_t              step[32];
    int32_t              i;

    // stage 0: nothing to do

    // stage 1: interleave the reversed odd-position inputs with the even-position inputs
    for (i = 0; i < 16; ++i) {
        output[2 * i]     = input[31 - 2 * i];
        output[2 * i + 1] = input[2 * i];
    }

    // stage 2: rotate every adjacent pair with weights (cospi[4i + 1], cospi[63 - 4i])
    for (i = 0; i < 16; ++i) {
        fadst32_rot_a(output + 2 * i, step + 2 * i, cospi[4 * i + 1], cospi[63 - 4 * i], cos_bit);
    }

    // stage 3: add/sub butterfly across the two 16-entry halves
    fadst32_add_sub(step, output, 16);

    // stage 4: lower half passes through, upper half is rotated
    for (i = 0; i < 16; ++i) {
        step[i] = output[i];
    }
    for (i = 0; i < 4; ++i) {
        const int32_t w_small = cospi[4 + 16 * i];
        const int32_t w_large = cospi[60 - 16 * i];
        fadst32_rot_a(output + 16 + 2 * i, step + 16 + 2 * i, w_small, w_large, cos_bit);
        fadst32_rot_b(output + 24 + 2 * i, step + 24 + 2 * i, w_large, w_small, cos_bit);
    }

    // stage 5: add/sub butterfly inside each group of 16
    fadst32_add_sub(step, output, 8);

    // stage 6: per 16-entry group, first 8 pass through and the last 8 are rotated
    for (int32_t base = 0; base < 32; base += 16) {
        for (i = 0; i < 8; ++i) {
            step[base + i] = output[base + i];
        }
        fadst32_rot_a(output + base + 8, step + base + 8, cospi[8], cospi[56], cos_bit);
        fadst32_rot_a(output + base + 10, step + base + 10, cospi[40], cospi[24], cos_bit);
        fadst32_rot_b(output + base + 12, step + base + 12, cospi[56], cospi[8], cos_bit);
        fadst32_rot_b(output + base + 14, step + base + 14, cospi[24], cospi[40], cos_bit);
    }

    // stage 7: add/sub butterfly inside each group of 8
    fadst32_add_sub(step, output, 4);

    // stage 8: per 8-entry group, first 4 pass through and the last 4 are rotated
    for (int32_t base = 0; base < 32; base += 8) {
        for (i = 0; i < 4; ++i) {
            step[base + i] = output[base + i];
        }
        fadst32_rot_a(output + base + 4, step + base + 4, cospi[16], cospi[48], cos_bit);
        fadst32_rot_b(output + base + 6, step + base + 6, cospi[48], cospi[16], cos_bit);
    }

    // stage 9: add/sub butterfly inside each group of 4
    fadst32_add_sub(step, output, 2);

    // stage 10: per 4-entry group, first 2 pass through and the last 2 are rotated by cospi[32]
    for (int32_t base = 0; base < 32; base += 4) {
        step[base]     = output[base];
        step[base + 1] = output[base + 1];
        fadst32_rot_a(output + base + 2, step + base + 2, cospi[32], cospi[32], cos_bit);
    }

    // stage 11: final permutation; every odd output slot is negated
    for (i = 0; i < 32; i += 2) {
        output[i]     = step[final_order[i]];
        output[i + 1] = -step[final_order[i + 1]];
    }
}
2897 | | |
2898 | 0 | void svt_av1_fidentity4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
2899 | 0 | (void)stage_range; |
2900 | 0 | (void)cos_bit; |
2901 | 0 | for (int32_t i = 0; i < 4; ++i) { |
2902 | 0 | output[i] = round_shift((int64_t)input[i] * new_sqrt2, new_sqrt2_bits); |
2903 | 0 | } |
2904 | 0 | assert(stage_range[0] + new_sqrt2_bits <= 32); |
2905 | 0 | } |
2906 | | |
/*
 * 8-point forward identity transform: every one of the 8 inputs is doubled.
 *
 * cos_bit and stage_range are accepted for signature compatibility with the
 * other 1-D kernels and are not used.
 */
void svt_av1_fidentity8_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)cos_bit;
    (void)stage_range;

    const int32_t* src = input;
    int32_t*       dst = output;
    for (int32_t remaining = 8; remaining > 0; --remaining) {
        *dst++ = *src++ * 2;
    }
}
2914 | | |
2915 | 0 | void svt_av1_fidentity16_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
2916 | 0 | (void)stage_range; |
2917 | 0 | (void)cos_bit; |
2918 | 0 | for (int32_t i = 0; i < 16; ++i) { |
2919 | 0 | output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits); |
2920 | 0 | } |
2921 | 0 | assert(stage_range[0] + new_sqrt2_bits <= 32); |
2922 | 0 | } |
2923 | | |
/*
 * 32-point forward identity transform: every one of the 32 inputs is
 * multiplied by 4.
 *
 * cos_bit and stage_range are accepted for signature compatibility with the
 * other 1-D kernels and are not used.
 */
void svt_av1_fidentity32_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)cos_bit;
    (void)stage_range;

    const int32_t* src = input;
    int32_t*       dst = output;
    for (int32_t remaining = 32; remaining > 0; --remaining) {
        *dst++ = *src++ * 4;
    }
}
2931 | | |
2932 | 0 | static void av1_fidentity64_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
2933 | 0 | (void)stage_range; |
2934 | 0 | (void)cos_bit; |
2935 | 0 | for (int32_t i = 0; i < 64; ++i) { |
2936 | 0 | output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits); |
2937 | 0 | } |
2938 | 0 | assert(stage_range[0] + new_sqrt2_bits <= 32); |
2939 | 0 | } |
2940 | | |
2941 | 60.3k | TxfmFunc svt_aom_fwd_txfm_type_to_func(TxfmType txfmtype) { |
2942 | 60.3k | switch (txfmtype) { |
2943 | 11.2k | case TXFM_TYPE_DCT4: |
2944 | 11.2k | return svt_av1_fdct4_new; |
2945 | 16.9k | case TXFM_TYPE_DCT8: |
2946 | 16.9k | return svt_av1_fdct8_new; |
2947 | 7.82k | case TXFM_TYPE_DCT16: |
2948 | 7.82k | return svt_av1_fdct16_new; |
2949 | 15.6k | case TXFM_TYPE_DCT32: |
2950 | 15.6k | return svt_av1_fdct32_new; |
2951 | 8.76k | case TXFM_TYPE_DCT64: |
2952 | 8.76k | return svt_av1_fdct64_new; |
2953 | 0 | case TXFM_TYPE_ADST4: |
2954 | 0 | return svt_av1_fadst4_new; |
2955 | 0 | case TXFM_TYPE_ADST8: |
2956 | 0 | return svt_av1_fadst8_new; |
2957 | 0 | case TXFM_TYPE_ADST16: |
2958 | 0 | return svt_av1_fadst16_new; |
2959 | 0 | case TXFM_TYPE_ADST32: |
2960 | 0 | return av1_fadst32_new; |
2961 | 0 | case TXFM_TYPE_IDENTITY4: |
2962 | 0 | return svt_av1_fidentity4_c; |
2963 | 0 | case TXFM_TYPE_IDENTITY8: |
2964 | 0 | return svt_av1_fidentity8_c; |
2965 | 0 | case TXFM_TYPE_IDENTITY16: |
2966 | 0 | return svt_av1_fidentity16_c; |
2967 | 0 | case TXFM_TYPE_IDENTITY32: |
2968 | 0 | return svt_av1_fidentity32_c; |
2969 | 0 | case TXFM_TYPE_IDENTITY64: |
2970 | 0 | return av1_fidentity64_c; |
2971 | 0 | default: |
2972 | 0 | assert(0); |
2973 | 0 | return NULL; |
2974 | 60.3k | } |
2975 | 60.3k | } |
2976 | | |
2977 | | //fwd_txfm2d_c |
2978 | | static INLINE void av1_tranform_two_d_core_c(int16_t* input, uint32_t input_stride, int32_t* output, |
2979 | 30.1k | const Txfm2dFlipCfg* cfg, int32_t* buf, uint8_t bit_depth) { |
2980 | 30.1k | int32_t c, r; |
2981 | | // Note when assigning txfm_size_col, we use the txfm_size from the |
2982 | | // row configuration and vice versa. This is intentionally done to |
2983 | | // accurately perform rectangular transforms. When the transform is |
2984 | | // rectangular, the number of columns will be the same as the |
2985 | | // txfm_size stored in the row cfg struct. It will make no difference |
2986 | | // for square transforms. |
2987 | 30.1k | const int32_t txfm_size_col = tx_size_wide[cfg->tx_size]; |
2988 | 30.1k | const int32_t txfm_size_row = tx_size_high[cfg->tx_size]; |
2989 | | // Take the shift from the larger dimension in the rectangular case. |
2990 | 30.1k | const int8_t* shift = cfg->shift; |
2991 | 30.1k | const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); |
2992 | 30.1k | int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; |
2993 | 30.1k | int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; |
2994 | 30.1k | assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); |
2995 | 30.1k | assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); |
2996 | 30.1k | svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth); |
2997 | | |
2998 | 30.1k | const int8_t cos_bit_col = cfg->cos_bit_col; |
2999 | 30.1k | const int8_t cos_bit_row = cfg->cos_bit_row; |
3000 | 30.1k | const TxfmFunc txfm_func_col = svt_aom_fwd_txfm_type_to_func(cfg->txfm_type_col); |
3001 | 30.1k | const TxfmFunc txfm_func_row = svt_aom_fwd_txfm_type_to_func(cfg->txfm_type_row); |
3002 | 30.1k | ASSERT(txfm_func_col != NULL); |
3003 | 30.1k | ASSERT(txfm_func_row != NULL); |
3004 | | // use output buffer as temp buffer |
3005 | 30.1k | int32_t* temp_in = output; |
3006 | 30.1k | int32_t* temp_out = output + txfm_size_row; |
3007 | | |
3008 | | // Columns |
3009 | 749k | for (c = 0; c < txfm_size_col; ++c) { |
3010 | 719k | if (cfg->ud_flip == 0) { |
3011 | 27.0M | for (r = 0; r < txfm_size_row; ++r) { |
3012 | 26.2M | temp_in[r] = input[r * input_stride + c]; |
3013 | 26.2M | } |
3014 | 18.4E | } else { |
3015 | 18.4E | for (r = 0; r < txfm_size_row; ++r) { |
3016 | | // flip upside down |
3017 | 0 | temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c]; |
3018 | 0 | } |
3019 | 18.4E | } |
3020 | 719k | svt_av1_round_shift_array_c(temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c |
3021 | 719k | txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); |
3022 | 719k | svt_av1_round_shift_array_c(temp_out, txfm_size_row, -shift[1]); // NM svt_av1_round_shift_array_c |
3023 | 719k | if (cfg->lr_flip == 0) { |
3024 | 27.0M | for (r = 0; r < txfm_size_row; ++r) { |
3025 | 26.3M | buf[r * txfm_size_col + c] = temp_out[r]; |
3026 | 26.3M | } |
3027 | 18.4E | } else { |
3028 | 18.4E | for (r = 0; r < txfm_size_row; ++r) { |
3029 | | // flip from left to right |
3030 | 0 | buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r]; |
3031 | 0 | } |
3032 | 18.4E | } |
3033 | 719k | } |
3034 | | |
3035 | | // Rows |
3036 | 674k | for (r = 0; r < txfm_size_row; ++r) { |
3037 | 644k | txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row); |
3038 | 644k | svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col, -shift[2]); |
3039 | | |
3040 | 644k | if (abs(rect_type) == 1) { |
3041 | | // Multiply everything by Sqrt2 if the transform is rectangular and the |
3042 | | // size difference is a factor of 2. |
3043 | 4.85M | for (c = 0; c < txfm_size_col; ++c) { |
3044 | 4.78M | output[r * txfm_size_col + c] = round_shift((int64_t)output[r * txfm_size_col + c] * new_sqrt2, |
3045 | 4.78M | new_sqrt2_bits); |
3046 | 4.78M | } |
3047 | 74.7k | } |
3048 | 644k | } |
3049 | 30.1k | } |
3050 | | |
3051 | 30.1k | static INLINE void set_fwd_txfm_non_scale_range(Txfm2dFlipCfg* cfg) { |
3052 | 30.1k | av1_zero(cfg->stage_range_col); |
3053 | 30.1k | av1_zero(cfg->stage_range_row); |
3054 | | |
3055 | 30.1k | if (cfg->txfm_type_col == TXFM_TYPE_INVALID) { |
3056 | 0 | return; |
3057 | 0 | } |
3058 | | |
3059 | 30.1k | const int8_t* range_mult2_col = fwd_txfm_range_mult2_list[cfg->txfm_type_col]; |
3060 | 30.1k | const int32_t stage_num_col = MIN(cfg->stage_num_col, MAX_TXFM_STAGE_NUM); |
3061 | 262k | for (int32_t i = 0; i < stage_num_col; ++i) { |
3062 | 232k | cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1; |
3063 | 232k | } |
3064 | | |
3065 | 30.1k | if (cfg->txfm_type_row != TXFM_TYPE_INVALID) { |
3066 | 30.1k | const int8_t* range_mult2_row = fwd_txfm_range_mult2_list[cfg->txfm_type_row]; |
3067 | 30.1k | const int32_t stage_num_row = MIN(cfg->stage_num_row, MAX_TXFM_STAGE_NUM); |
3068 | 267k | for (int32_t i = 0; i < stage_num_row; ++i) { |
3069 | 237k | cfg->stage_range_row[i] = (range_mult2_col[cfg->stage_num_col - 1] + range_mult2_row[i] + 1) >> 1; |
3070 | 237k | } |
3071 | 30.1k | } |
3072 | 30.1k | } |
3073 | | |
3074 | 30.1k | void svt_aom_transform_config(TxType tx_type, TxSize tx_size, Txfm2dFlipCfg* cfg) { |
3075 | 30.1k | assert(cfg != NULL); |
3076 | 30.1k | cfg->tx_size = tx_size; |
3077 | 30.1k | set_flip_cfg(tx_type, cfg); |
3078 | 30.1k | const TxType1D tx_type_1d_col = vtx_tab[tx_type]; |
3079 | 30.1k | const TxType1D tx_type_1d_row = htx_tab[tx_type]; |
3080 | 30.1k | const int32_t txw_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]; |
3081 | 30.1k | const int32_t txh_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0]; |
3082 | 30.1k | cfg->shift = fwd_txfm_shift_ls[tx_size]; |
3083 | 30.1k | cfg->cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; |
3084 | 30.1k | cfg->cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; |
3085 | 30.1k | cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; |
3086 | 30.1k | cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; |
3087 | 30.1k | cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; |
3088 | 30.1k | cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; |
3089 | 30.1k | set_fwd_txfm_non_scale_range(cfg); |
3090 | 30.1k | } |
3091 | | |
/*
 * Accumulates SQR() of every coefficient in an area_width x area_height
 * window, each value widened to 64 bits first.
 *
 * coeff        : first coefficient of the window.
 * coeff_stride : distance between consecutive rows, in coefficients.
 * Returns the accumulated total as an unsigned 64-bit value.
 */
static uint64_t energy_computation(int32_t* coeff, uint32_t coeff_stride, uint32_t area_width, uint32_t area_height) {
    uint64_t       energy  = 0;
    const int32_t* row_ptr = coeff;

    for (uint32_t rows_left = area_height; rows_left != 0; --rows_left) {
        for (uint32_t x = 0; x < area_width; ++x) {
            // Widen before squaring so the product is formed in 64 bits.
            const int64_t value = (int64_t)(row_ptr[x]);
            energy += (int64_t)SQR(value);
        }
        row_ptr += coeff_stride;
    }

    return energy;
}
3104 | | |
/*
 * Post-processes a 64x64 coefficient block stored with a stride of 64.
 *
 * Measures the energy of everything outside the top-left 32x32 quadrant
 * (top-right 32x32 plus the bottom 64x32 area), then packs the top-left
 * quadrant so that it is stored contiguously with a stride of 32.
 *
 * Returns the energy of the three measured quadrants.
 */
uint64_t svt_handle_transform64x64_c(int32_t* output) {
    // Top-right 32x32 quadrant.
    const uint64_t top_right_energy = energy_computation(output + 32, 64, 32, 32);
    // Bottom half: 64 wide, 32 tall.
    const uint64_t bottom_energy = energy_computation(output + 32 * 64, 64, 64, 32);

    // Row 0 is already in place; move rows 1..31 from stride 64 to stride 32.
    int32_t*       packed   = output + 32;
    const int32_t* unpacked = output + 64;
    for (int32_t row = 1; row < 32; ++row) {
        svt_memcpy_c(packed, unpacked, 32 * sizeof(*output));
        packed += 32;
        unpacked += 64;
    }

    return top_right_energy + bottom_energy;
}
3120 | | |
3121 | | void svt_av1_transform_two_d_64x64_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3122 | 3.21k | uint8_t bit_depth) { |
3123 | 3.21k | int32_t intermediate_transform_buffer[64 * 64]; |
3124 | 3.21k | Txfm2dFlipCfg cfg; |
3125 | | //av1_get_fwd_txfm_cfg |
3126 | 3.21k | svt_aom_transform_config(transform_type, TX_64X64, &cfg); |
3127 | | //fwd_txfm2d_c |
3128 | 3.21k | av1_tranform_two_d_core_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3129 | 3.21k | } |
3130 | | |
3131 | | void svt_av1_transform_two_d_32x32_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3132 | 6.63k | uint8_t bit_depth) { |
3133 | 6.63k | int32_t intermediate_transform_buffer[32 * 32]; |
3134 | 6.63k | Txfm2dFlipCfg cfg; |
3135 | | |
3136 | 6.63k | svt_aom_transform_config(transform_type, TX_32X32, &cfg); |
3137 | | |
3138 | 6.63k | av1_tranform_two_d_core_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3139 | 6.63k | } |
3140 | | |
3141 | | void svt_av1_transform_two_d_16x16_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3142 | 3.91k | uint8_t bit_depth) { |
3143 | 3.91k | int32_t intermediate_transform_buffer[16 * 16]; |
3144 | 3.91k | Txfm2dFlipCfg cfg; |
3145 | | |
3146 | 3.91k | svt_aom_transform_config(transform_type, TX_16X16, &cfg); |
3147 | | |
3148 | 3.91k | av1_tranform_two_d_core_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3149 | 3.91k | } |
3150 | | |
3151 | | void svt_av1_transform_two_d_8x8_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3152 | 8.44k | uint8_t bit_depth) { |
3153 | 8.44k | int32_t intermediate_transform_buffer[8 * 8]; |
3154 | 8.44k | Txfm2dFlipCfg cfg; |
3155 | | |
3156 | 8.44k | svt_aom_transform_config(transform_type, TX_8X8, &cfg); |
3157 | | |
3158 | 8.44k | av1_tranform_two_d_core_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3159 | 8.44k | } |
3160 | | |
3161 | | void svt_av1_transform_two_d_4x4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3162 | 5.62k | uint8_t bit_depth) { |
3163 | 5.62k | int32_t intermediate_transform_buffer[4 * 4]; |
3164 | 5.62k | Txfm2dFlipCfg cfg; |
3165 | | |
3166 | 5.62k | svt_aom_transform_config(transform_type, TX_4X4, &cfg); |
3167 | | |
3168 | 5.62k | av1_tranform_two_d_core_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3169 | 5.62k | } |
3170 | | |
/*********************************************************************
 * Rectangular forward 2D transforms (C versions) and helpers that
 * post-process the coefficients of 64-point transforms
 *********************************************************************/
3174 | | void svt_av1_fwd_txfm2d_64x32_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3175 | 2.33k | uint8_t bit_depth) { |
3176 | 2.33k | int32_t intermediate_transform_buffer[64 * 32]; |
3177 | 2.33k | Txfm2dFlipCfg cfg; |
3178 | | /*av1_get_fwd_txfm_cfg*/ |
3179 | 2.33k | svt_aom_transform_config(transform_type, TX_64X32, &cfg); |
3180 | 2.33k | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3181 | 2.33k | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3182 | 2.33k | } |
3183 | | |
/*
 * Post-process a 64x32 coefficient block stored with a row stride of 64.
 *
 * Returns the energy of the top-right 32x32 area, then compacts the left
 * 32x32 area in place so its rows are 32 elements apart starting at output[0].
 */
uint64_t svt_handle_transform64x32_c(int32_t* output) {
    // Measure before repacking: the repack overwrites part of the measured area.
    const uint64_t right_half_energy = energy_computation(output + 32, 64, 32, 32);

    // Row 0 is already in place; move rows 1..31 from stride 64 to stride 32.
    int32_t*       dst = output + 32;
    const int32_t* src = output + 64;
    for (int32_t row = 1; row < 32; ++row, dst += 32, src += 64) {
        svt_memcpy_c(dst, src, 32 * sizeof(*output));
    }

    return right_half_energy;
}
3195 | | |
3196 | | void svt_av1_fwd_txfm2d_32x64_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3197 | 0 | uint8_t bit_depth) { |
3198 | 0 | int32_t intermediate_transform_buffer[32 * 64]; |
3199 | |
|
3200 | 0 | Txfm2dFlipCfg cfg; |
3201 | | /*av1_get_fwd_txfm_cfg*/ |
3202 | 0 | svt_aom_transform_config(transform_type, TX_32X64, &cfg); |
3203 | | /*fwd_txfm2d_c*/ |
3204 | 0 | av1_tranform_two_d_core_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3205 | 0 | } |
3206 | | |
/*
 * Returns the energy of the bottom 32x32 area of a 32x64 coefficient block
 * stored with a row stride of 32. The buffer is not modified.
 */
uint64_t svt_handle_transform32x64_c(int32_t* output) {
    int32_t* bottom_half = output + 32 * 32;
    return energy_computation(bottom_half, 32, 32, 32);
}
3212 | | |
3213 | | void svt_av1_fwd_txfm2d_64x16_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3214 | 0 | uint8_t bit_depth) { |
3215 | 0 | int32_t intermediate_transform_buffer[64 * 16]; |
3216 | 0 | Txfm2dFlipCfg cfg; |
3217 | | /*av1_get_fwd_txfm_cfg*/ |
3218 | 0 | svt_aom_transform_config(transform_type, TX_64X16, &cfg); |
3219 | 0 | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3220 | 0 | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3221 | 0 | } |
3222 | | |
/*
 * Post-process a 64x16 coefficient block stored with a row stride of 64.
 *
 * Returns the energy of the top-right 32x16 area, then compacts the left
 * 32x16 area in place so its rows are 32 elements apart starting at output[0].
 */
uint64_t svt_handle_transform64x16_c(int32_t* output) {
    // Measure before repacking: the repack overwrites part of the measured area.
    const uint64_t right_half_energy = energy_computation(output + 32, 64, 32, 16);

    // Row 0 is already in place; move rows 1..15 from stride 64 to stride 32.
    int32_t*       dst = output + 32;
    const int32_t* src = output + 64;
    for (int32_t row = 1; row < 16; ++row, dst += 32, src += 64) {
        svt_memcpy_c(dst, src, 32 * sizeof(*output));
    }

    return right_half_energy;
}
3234 | | |
3235 | | void svt_av1_fwd_txfm2d_16x64_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3236 | 0 | uint8_t bit_depth) { |
3237 | 0 | int32_t intermediate_transform_buffer[16 * 64]; |
3238 | |
|
3239 | 0 | Txfm2dFlipCfg cfg; |
3240 | | /*av1_get_fwd_txfm_cfg*/ |
3241 | 0 | svt_aom_transform_config(transform_type, TX_16X64, &cfg); |
3242 | | /*fwd_txfm2d_c*/ |
3243 | 0 | av1_tranform_two_d_core_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3244 | 0 | } |
3245 | | |
/*
 * Returns the energy of the bottom 16x32 area of a 16x64 coefficient block
 * stored with a row stride of 16. The buffer is not modified.
 */
uint64_t svt_handle_transform16x64_c(int32_t* output) {
    int32_t* bottom_half = output + 16 * 32;
    return energy_computation(bottom_half, 16, 16, 32);
}
3251 | | |
/*
 * 16x64 handler used with the N2/N4 transforms: performs no repacking and no
 * energy measurement. The argument is ignored and 0 is always returned.
 */
uint64_t svt_handle_transform16x64_N2_N4_c(int32_t* output) {
    const uint64_t no_energy = 0;

    (void)output; // intentionally unused
    return no_energy;
}
3256 | | |
/*
 * 32x64 handler used with the N2/N4 transforms: performs no repacking and no
 * energy measurement. The argument is ignored and 0 is always returned.
 */
uint64_t svt_handle_transform32x64_N2_N4_c(int32_t* output) {
    const uint64_t no_energy = 0;

    (void)output; // intentionally unused
    return no_energy;
}
3261 | | |
/*
 * 64x16 handler used with the N2/N4 transforms: compacts the left 32x16 area
 * of a stride-64 block in place so its rows are 32 elements apart, and
 * always returns 0 (no energy is measured).
 */
uint64_t svt_handle_transform64x16_N2_N4_c(int32_t* output) {
    // Row 0 is already in place; move rows 1..15 from stride 64 to stride 32.
    int32_t*       dst = output + 32;
    const int32_t* src = output + 64;
    for (int32_t row = 1; row < 16; ++row, dst += 32, src += 64) {
        svt_memcpy_c(dst, src, 32 * sizeof(*output));
    }

    return 0;
}
3270 | | |
/*
 * 64x32 handler used with the N2/N4 transforms: compacts the left 32x32 area
 * of a stride-64 block in place so its rows are 32 elements apart, and
 * always returns 0 (no energy is measured).
 */
uint64_t svt_handle_transform64x32_N2_N4_c(int32_t* output) {
    // Row 0 is already in place; move rows 1..31 from stride 64 to stride 32.
    int32_t*       dst = output + 32;
    const int32_t* src = output + 64;
    for (int32_t row = 1; row < 32; ++row, dst += 32, src += 64) {
        svt_memcpy_c(dst, src, 32 * sizeof(*output));
    }

    return 0;
}
3279 | | |
/*
 * 64x64 handler used with the N2/N4 transforms: compacts the top-left 32x32
 * area of a stride-64 block in place so its rows are 32 elements apart, and
 * always returns 0 (no energy is measured).
 */
uint64_t svt_handle_transform64x64_N2_N4_c(int32_t* output) {
    // Row 0 is already in place; move rows 1..31 from stride 64 to stride 32.
    int32_t*       dst = output + 32;
    const int32_t* src = output + 64;
    for (int32_t row = 1; row < 32; ++row, dst += 32, src += 64) {
        svt_memcpy_c(dst, src, 32 * sizeof(*output));
    }

    return 0;
}
3288 | | |
3289 | | void svt_av1_fwd_txfm2d_32x16_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3290 | 0 | uint8_t bit_depth) { |
3291 | 0 | int32_t intermediate_transform_buffer[32 * 16]; |
3292 | 0 | Txfm2dFlipCfg cfg; |
3293 | 0 | /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_32X16, &cfg); |
3294 | 0 | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3295 | 0 | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3296 | 0 | } |
3297 | | |
3298 | | void svt_av1_fwd_txfm2d_16x32_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3299 | 0 | uint8_t bit_depth) { |
3300 | 0 | int32_t intermediate_transform_buffer[16 * 32]; |
3301 | 0 | Txfm2dFlipCfg cfg; |
3302 | 0 | /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_16X32, &cfg); |
3303 | 0 | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3304 | 0 | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3305 | 0 | } |
3306 | | |
3307 | | void svt_av1_fwd_txfm2d_16x8_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3308 | 0 | uint8_t bit_depth) { |
3309 | 0 | int32_t intermediate_transform_buffer[16 * 8]; |
3310 | 0 | Txfm2dFlipCfg cfg; |
3311 | 0 | /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_16X8, &cfg); |
3312 | 0 | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3313 | 0 | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3314 | 0 | } |
3315 | | |
3316 | | void svt_av1_fwd_txfm2d_8x16_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3317 | 0 | uint8_t bit_depth) { |
3318 | 0 | int32_t intermediate_transform_buffer[8 * 16]; |
3319 | 0 | Txfm2dFlipCfg cfg; |
3320 | 0 | /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_8X16, &cfg); |
3321 | 0 | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3322 | 0 | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3323 | 0 | } |
3324 | | |
3325 | | void svt_av1_fwd_txfm2d_32x8_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3326 | 0 | uint8_t bit_depth) { |
3327 | 0 | int32_t intermediate_transform_buffer[32 * 8]; |
3328 | 0 | Txfm2dFlipCfg cfg; |
3329 | 0 | /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_32X8, &cfg); |
3330 | 0 | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3331 | 0 | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3332 | 0 | } |
3333 | | |
3334 | | void svt_av1_fwd_txfm2d_8x32_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3335 | 0 | uint8_t bit_depth) { |
3336 | 0 | int32_t intermediate_transform_buffer[8 * 32]; |
3337 | 0 | Txfm2dFlipCfg cfg; |
3338 | 0 | /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_8X32, &cfg); |
3339 | 0 | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3340 | 0 | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3341 | 0 | } |
3342 | | |
3343 | | void svt_av1_fwd_txfm2d_16x4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3344 | 0 | uint8_t bit_depth) { |
3345 | 0 | int32_t intermediate_transform_buffer[16 * 4]; |
3346 | 0 | Txfm2dFlipCfg cfg; |
3347 | 0 | /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_16X4, &cfg); |
3348 | 0 | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3349 | 0 | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3350 | 0 | } |
3351 | | |
3352 | | void svt_av1_fwd_txfm2d_4x16_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3353 | 0 | uint8_t bit_depth) { |
3354 | 0 | int32_t intermediate_transform_buffer[4 * 16]; |
3355 | 0 | Txfm2dFlipCfg cfg; |
3356 | 0 | /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_4X16, &cfg); |
3357 | 0 | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3358 | 0 | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3359 | 0 | } |
3360 | | |
3361 | | void svt_av1_fwd_txfm2d_8x4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3362 | 0 | uint8_t bit_depth) { |
3363 | 0 | int32_t intermediate_transform_buffer[8 * 4]; |
3364 | 0 | Txfm2dFlipCfg cfg; |
3365 | 0 | /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_8X4, &cfg); |
3366 | 0 | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3367 | 0 | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3368 | 0 | } |
3369 | | |
3370 | | void svt_av1_fwd_txfm2d_4x8_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
3371 | 0 | uint8_t bit_depth) { |
3372 | 0 | int32_t intermediate_transform_buffer[4 * 8]; |
3373 | 0 | Txfm2dFlipCfg cfg; |
3374 | 0 | /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_4X8, &cfg); |
3375 | 0 | /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c( |
3376 | 0 | input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
3377 | 0 | } |
3378 | | |
3379 | | static EbErrorType av1_estimate_transform_N2(int16_t* residual_buffer, uint32_t residual_stride, int32_t* coeff_buffer, |
3380 | | uint32_t coeff_stride, TxSize transform_size, uint64_t* three_quad_energy, |
3381 | | uint32_t bit_depth, TxType transform_type, PlaneType component_type) |
3382 | | |
3383 | 0 | { |
3384 | 0 | EbErrorType return_error = EB_ErrorNone; |
3385 | |
|
3386 | 0 | (void)coeff_stride; |
3387 | 0 | (void)component_type; |
3388 | |
|
3389 | 0 | switch (transform_size) { |
3390 | 0 | case TX_64X32: |
3391 | 0 | if (transform_type == DCT_DCT) { |
3392 | 0 | svt_av1_fwd_txfm2d_64x32_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3393 | 0 | } else { |
3394 | 0 | svt_av1_fwd_txfm2d_64x32_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3395 | 0 | } |
3396 | |
|
3397 | 0 | *three_quad_energy = svt_handle_transform64x32_N2_N4(coeff_buffer); |
3398 | |
|
3399 | 0 | break; |
3400 | | |
3401 | 0 | case TX_32X64: |
3402 | 0 | if (transform_type == DCT_DCT) { |
3403 | 0 | svt_av1_fwd_txfm2d_32x64_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3404 | 0 | } else { |
3405 | 0 | svt_av1_fwd_txfm2d_32x64_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3406 | 0 | } |
3407 | |
|
3408 | 0 | *three_quad_energy = svt_handle_transform32x64_N2_N4(coeff_buffer); |
3409 | |
|
3410 | 0 | break; |
3411 | | |
3412 | 0 | case TX_64X16: |
3413 | 0 | if (transform_type == DCT_DCT) { |
3414 | 0 | svt_av1_fwd_txfm2d_64x16_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3415 | 0 | } else { |
3416 | 0 | svt_av1_fwd_txfm2d_64x16_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3417 | 0 | } |
3418 | |
|
3419 | 0 | *three_quad_energy = svt_handle_transform64x16_N2_N4(coeff_buffer); |
3420 | |
|
3421 | 0 | break; |
3422 | | |
3423 | 0 | case TX_16X64: |
3424 | 0 | if (transform_type == DCT_DCT) { |
3425 | 0 | svt_av1_fwd_txfm2d_16x64_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3426 | 0 | } else { |
3427 | 0 | svt_av1_fwd_txfm2d_16x64_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3428 | 0 | } |
3429 | |
|
3430 | 0 | *three_quad_energy = svt_handle_transform16x64_N2_N4(coeff_buffer); |
3431 | |
|
3432 | 0 | break; |
3433 | | |
3434 | 0 | case TX_32X16: |
3435 | | // TTK |
3436 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3437 | 0 | svt_av1_fwd_txfm2d_32x16_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3438 | 0 | } else { |
3439 | 0 | svt_av1_fwd_txfm2d_32x16_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3440 | 0 | } |
3441 | 0 | break; |
3442 | | |
3443 | 0 | case TX_16X32: |
3444 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3445 | 0 | svt_av1_fwd_txfm2d_16x32_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3446 | 0 | } else { |
3447 | 0 | svt_av1_fwd_txfm2d_16x32_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3448 | 0 | } |
3449 | 0 | break; |
3450 | | |
3451 | 0 | case TX_16X8: |
3452 | 0 | svt_av1_fwd_txfm2d_16x8_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3453 | 0 | break; |
3454 | | |
3455 | 0 | case TX_8X16: |
3456 | 0 | svt_av1_fwd_txfm2d_8x16_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3457 | 0 | break; |
3458 | | |
3459 | 0 | case TX_32X8: |
3460 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3461 | 0 | svt_av1_fwd_txfm2d_32x8_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3462 | 0 | } else { |
3463 | 0 | svt_av1_fwd_txfm2d_32x8_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3464 | 0 | } |
3465 | 0 | break; |
3466 | | |
3467 | 0 | case TX_8X32: |
3468 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3469 | 0 | svt_av1_fwd_txfm2d_8x32_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3470 | 0 | } else { |
3471 | 0 | svt_av1_fwd_txfm2d_8x32_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3472 | 0 | } |
3473 | 0 | break; |
3474 | 0 | case TX_16X4: |
3475 | 0 | svt_av1_fwd_txfm2d_16x4_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3476 | 0 | break; |
3477 | 0 | case TX_4X16: |
3478 | 0 | svt_av1_fwd_txfm2d_4x16_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3479 | 0 | break; |
3480 | 0 | case TX_8X4: |
3481 | |
|
3482 | 0 | svt_av1_fwd_txfm2d_8x4_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3483 | |
|
3484 | 0 | break; |
3485 | 0 | case TX_4X8: |
3486 | |
|
3487 | 0 | svt_av1_fwd_txfm2d_4x8_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3488 | |
|
3489 | 0 | break; |
3490 | | |
3491 | 0 | case TX_64X64: |
3492 | |
|
3493 | 0 | svt_av1_fwd_txfm2d_64x64_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3494 | |
|
3495 | 0 | *three_quad_energy = svt_handle_transform64x64_N2_N4(coeff_buffer); |
3496 | |
|
3497 | 0 | break; |
3498 | | |
3499 | 0 | case TX_32X32: |
3500 | 0 | if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST || |
3501 | 0 | transform_type == H_ADST || transform_type == V_FLIPADST || transform_type == H_FLIPADST) { |
3502 | | // Tahani: I believe those cases are never hit |
3503 | 0 | svt_aom_transform_two_d_32x32_N2_c( |
3504 | 0 | residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3505 | 0 | } |
3506 | | |
3507 | 0 | else { |
3508 | 0 | svt_av1_fwd_txfm2d_32x32_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3509 | 0 | } |
3510 | |
|
3511 | 0 | break; |
3512 | | |
3513 | 0 | case TX_16X16: |
3514 | |
|
3515 | 0 | svt_av1_fwd_txfm2d_16x16_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3516 | |
|
3517 | 0 | break; |
3518 | 0 | case TX_8X8: |
3519 | |
|
3520 | 0 | svt_av1_fwd_txfm2d_8x8_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3521 | |
|
3522 | 0 | break; |
3523 | 0 | case TX_4X4: |
3524 | |
|
3525 | 0 | svt_av1_fwd_txfm2d_4x4_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3526 | |
|
3527 | 0 | break; |
3528 | 0 | default: |
3529 | 0 | assert(0); |
3530 | 0 | break; |
3531 | 0 | } |
3532 | | |
3533 | 0 | return return_error; |
3534 | 0 | } |
3535 | | |
3536 | | static EbErrorType av1_estimate_transform_N4(int16_t* residual_buffer, uint32_t residual_stride, int32_t* coeff_buffer, |
3537 | | uint32_t coeff_stride, TxSize transform_size, uint64_t* three_quad_energy, |
3538 | | uint32_t bit_depth, TxType transform_type, PlaneType component_type) |
3539 | | |
3540 | 0 | { |
3541 | 0 | EbErrorType return_error = EB_ErrorNone; |
3542 | |
|
3543 | 0 | (void)coeff_stride; |
3544 | 0 | (void)component_type; |
3545 | |
|
3546 | 0 | switch (transform_size) { |
3547 | 0 | case TX_64X32: |
3548 | 0 | if (transform_type == DCT_DCT) { |
3549 | 0 | svt_av1_fwd_txfm2d_64x32_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3550 | 0 | } else { |
3551 | 0 | svt_av1_fwd_txfm2d_64x32_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3552 | 0 | } |
3553 | |
|
3554 | 0 | *three_quad_energy = svt_handle_transform64x32_N2_N4(coeff_buffer); |
3555 | |
|
3556 | 0 | break; |
3557 | | |
3558 | 0 | case TX_32X64: |
3559 | 0 | if (transform_type == DCT_DCT) { |
3560 | 0 | svt_av1_fwd_txfm2d_32x64_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3561 | 0 | } else { |
3562 | 0 | svt_av1_fwd_txfm2d_32x64_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3563 | 0 | } |
3564 | |
|
3565 | 0 | *three_quad_energy = svt_handle_transform32x64_N2_N4(coeff_buffer); |
3566 | |
|
3567 | 0 | break; |
3568 | | |
3569 | 0 | case TX_64X16: |
3570 | 0 | if (transform_type == DCT_DCT) { |
3571 | 0 | svt_av1_fwd_txfm2d_64x16_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3572 | 0 | } else { |
3573 | 0 | svt_av1_fwd_txfm2d_64x16_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3574 | 0 | } |
3575 | |
|
3576 | 0 | *three_quad_energy = svt_handle_transform64x16_N2_N4(coeff_buffer); |
3577 | |
|
3578 | 0 | break; |
3579 | | |
3580 | 0 | case TX_16X64: |
3581 | 0 | if (transform_type == DCT_DCT) { |
3582 | 0 | svt_av1_fwd_txfm2d_16x64_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3583 | 0 | } else { |
3584 | 0 | svt_av1_fwd_txfm2d_16x64_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3585 | 0 | } |
3586 | |
|
3587 | 0 | *three_quad_energy = svt_handle_transform16x64_N2_N4(coeff_buffer); |
3588 | |
|
3589 | 0 | break; |
3590 | | |
3591 | 0 | case TX_32X16: |
3592 | | // TTK |
3593 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3594 | 0 | svt_av1_fwd_txfm2d_32x16_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3595 | 0 | } else { |
3596 | 0 | svt_av1_fwd_txfm2d_32x16_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3597 | 0 | } |
3598 | 0 | break; |
3599 | | |
3600 | 0 | case TX_16X32: |
3601 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3602 | 0 | svt_av1_fwd_txfm2d_16x32_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3603 | 0 | } else { |
3604 | 0 | svt_av1_fwd_txfm2d_16x32_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3605 | 0 | } |
3606 | 0 | break; |
3607 | | |
3608 | 0 | case TX_16X8: |
3609 | 0 | svt_av1_fwd_txfm2d_16x8_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3610 | 0 | break; |
3611 | | |
3612 | 0 | case TX_8X16: |
3613 | 0 | svt_av1_fwd_txfm2d_8x16_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3614 | 0 | break; |
3615 | | |
3616 | 0 | case TX_32X8: |
3617 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3618 | 0 | svt_av1_fwd_txfm2d_32x8_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3619 | 0 | } else { |
3620 | 0 | svt_av1_fwd_txfm2d_32x8_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3621 | 0 | } |
3622 | 0 | break; |
3623 | | |
3624 | 0 | case TX_8X32: |
3625 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3626 | 0 | svt_av1_fwd_txfm2d_8x32_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3627 | 0 | } else { |
3628 | 0 | svt_av1_fwd_txfm2d_8x32_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3629 | 0 | } |
3630 | 0 | break; |
3631 | 0 | case TX_16X4: |
3632 | 0 | svt_av1_fwd_txfm2d_16x4_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3633 | 0 | break; |
3634 | 0 | case TX_4X16: |
3635 | 0 | svt_av1_fwd_txfm2d_4x16_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3636 | 0 | break; |
3637 | 0 | case TX_8X4: |
3638 | |
|
3639 | 0 | svt_av1_fwd_txfm2d_8x4_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3640 | |
|
3641 | 0 | break; |
3642 | 0 | case TX_4X8: |
3643 | |
|
3644 | 0 | svt_av1_fwd_txfm2d_4x8_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3645 | |
|
3646 | 0 | break; |
3647 | | |
3648 | 0 | case TX_64X64: |
3649 | |
|
3650 | 0 | svt_av1_fwd_txfm2d_64x64_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3651 | |
|
3652 | 0 | *three_quad_energy = svt_handle_transform64x64_N2_N4(coeff_buffer); |
3653 | |
|
3654 | 0 | break; |
3655 | | |
3656 | 0 | case TX_32X32: |
3657 | 0 | if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST || |
3658 | 0 | transform_type == H_ADST || transform_type == V_FLIPADST || transform_type == H_FLIPADST) { |
3659 | | // Tahani: I believe those cases are never hit |
3660 | 0 | svt_aom_transform_two_d_32x32_N4_c( |
3661 | 0 | residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3662 | 0 | } |
3663 | | |
3664 | 0 | else { |
3665 | 0 | svt_av1_fwd_txfm2d_32x32_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3666 | 0 | } |
3667 | |
|
3668 | 0 | break; |
3669 | | |
3670 | 0 | case TX_16X16: |
3671 | |
|
3672 | 0 | svt_av1_fwd_txfm2d_16x16_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3673 | |
|
3674 | 0 | break; |
3675 | 0 | case TX_8X8: |
3676 | |
|
3677 | 0 | svt_av1_fwd_txfm2d_8x8_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3678 | |
|
3679 | 0 | break; |
3680 | 0 | case TX_4X4: |
3681 | |
|
3682 | 0 | svt_av1_fwd_txfm2d_4x4_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3683 | |
|
3684 | 0 | break; |
3685 | 0 | default: |
3686 | 0 | assert(0); |
3687 | 0 | break; |
3688 | 0 | } |
3689 | | |
3690 | 0 | return return_error; |
3691 | 0 | } |
3692 | | |
3693 | | static EbErrorType av1_estimate_transform_ONLY_DC(int16_t* residual_buffer, uint32_t residual_stride, |
3694 | | int32_t* coeff_buffer, uint32_t coeff_stride, TxSize transform_size, |
3695 | | uint64_t* three_quad_energy, uint32_t bit_depth, |
3696 | | TxType transform_type, PlaneType component_type) |
3697 | | |
3698 | 0 | { |
3699 | 0 | EbErrorType return_error = av1_estimate_transform_N4(residual_buffer, |
3700 | 0 | residual_stride, |
3701 | 0 | coeff_buffer, |
3702 | 0 | coeff_stride, |
3703 | 0 | transform_size, |
3704 | 0 | three_quad_energy, |
3705 | 0 | bit_depth, |
3706 | 0 | transform_type, |
3707 | 0 | component_type); |
3708 | |
|
3709 | 0 | for (int i = 1; i < (tx_size_wide[transform_size] * tx_size_high[transform_size]); i++) { |
3710 | 0 | if (i % tx_size_wide[transform_size] < (tx_size_wide[transform_size] >> 2) || |
3711 | 0 | i / tx_size_wide[transform_size] < (tx_size_high[transform_size] >> 2)) { |
3712 | 0 | coeff_buffer[i] = 0; |
3713 | 0 | } |
3714 | 0 | } |
3715 | 0 | return return_error; |
3716 | 0 | } |
3717 | | |
3718 | | static EbErrorType av1_estimate_transform_default(int16_t* residual_buffer, uint32_t residual_stride, |
3719 | | int32_t* coeff_buffer, uint32_t coeff_stride, TxSize transform_size, |
3720 | | uint64_t* three_quad_energy, uint32_t bit_depth, |
3721 | | TxType transform_type, PlaneType component_type) |
3722 | | |
3723 | 30.1k | { |
3724 | 30.1k | EbErrorType return_error = EB_ErrorNone; |
3725 | | |
3726 | 30.1k | (void)coeff_stride; |
3727 | 30.1k | (void)component_type; |
3728 | | |
3729 | 30.1k | switch (transform_size) { |
3730 | 2.33k | case TX_64X32: |
3731 | 2.33k | if (transform_type == DCT_DCT) { |
3732 | 2.33k | svt_av1_fwd_txfm2d_64x32(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3733 | 2.33k | } else { |
3734 | 0 | svt_av1_fwd_txfm2d_64x32_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3735 | 0 | } |
3736 | | |
3737 | 2.33k | *three_quad_energy = svt_handle_transform64x32(coeff_buffer); |
3738 | | |
3739 | 2.33k | break; |
3740 | | |
3741 | 0 | case TX_32X64: |
3742 | 0 | if (transform_type == DCT_DCT) { |
3743 | 0 | svt_av1_fwd_txfm2d_32x64(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3744 | 0 | } else { |
3745 | 0 | svt_av1_fwd_txfm2d_32x64_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3746 | 0 | } |
3747 | |
|
3748 | 0 | *three_quad_energy = svt_handle_transform32x64(coeff_buffer); |
3749 | |
|
3750 | 0 | break; |
3751 | | |
3752 | 0 | case TX_64X16: |
3753 | 0 | if (transform_type == DCT_DCT) { |
3754 | 0 | svt_av1_fwd_txfm2d_64x16(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3755 | 0 | } else { |
3756 | 0 | svt_av1_fwd_txfm2d_64x16_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3757 | 0 | } |
3758 | |
|
3759 | 0 | *three_quad_energy = svt_handle_transform64x16(coeff_buffer); |
3760 | |
|
3761 | 0 | break; |
3762 | | |
3763 | 0 | case TX_16X64: |
3764 | 0 | if (transform_type == DCT_DCT) { |
3765 | 0 | svt_av1_fwd_txfm2d_16x64(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3766 | 0 | } else { |
3767 | 0 | svt_av1_fwd_txfm2d_16x64_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3768 | 0 | } |
3769 | |
|
3770 | 0 | *three_quad_energy = svt_handle_transform16x64(coeff_buffer); |
3771 | |
|
3772 | 0 | break; |
3773 | | |
3774 | 0 | case TX_32X16: |
3775 | | // TTK |
3776 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3777 | 0 | svt_av1_fwd_txfm2d_32x16(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3778 | 0 | } else { |
3779 | 0 | svt_av1_fwd_txfm2d_32x16_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3780 | 0 | } |
3781 | 0 | break; |
3782 | | |
3783 | 0 | case TX_16X32: |
3784 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3785 | 0 | svt_av1_fwd_txfm2d_16x32(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3786 | 0 | } else { |
3787 | 0 | svt_av1_fwd_txfm2d_16x32_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3788 | 0 | } |
3789 | 0 | break; |
3790 | | |
3791 | 0 | case TX_16X8: |
3792 | 0 | svt_av1_fwd_txfm2d_16x8(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3793 | 0 | break; |
3794 | | |
3795 | 0 | case TX_8X16: |
3796 | 0 | svt_av1_fwd_txfm2d_8x16(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3797 | 0 | break; |
3798 | | |
3799 | 0 | case TX_32X8: |
3800 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3801 | 0 | svt_av1_fwd_txfm2d_32x8(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3802 | 0 | } else { |
3803 | 0 | svt_av1_fwd_txfm2d_32x8_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3804 | 0 | } |
3805 | 0 | break; |
3806 | | |
3807 | 0 | case TX_8X32: |
3808 | 0 | if ((transform_type == DCT_DCT) || (transform_type == IDTX)) { |
3809 | 0 | svt_av1_fwd_txfm2d_8x32(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3810 | 0 | } else { |
3811 | 0 | svt_av1_fwd_txfm2d_8x32_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3812 | 0 | } |
3813 | 0 | break; |
3814 | 0 | case TX_16X4: |
3815 | 0 | svt_av1_fwd_txfm2d_16x4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3816 | 0 | break; |
3817 | 0 | case TX_4X16: |
3818 | 0 | svt_av1_fwd_txfm2d_4x16(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3819 | 0 | break; |
3820 | 0 | case TX_8X4: |
3821 | |
|
3822 | 0 | svt_av1_fwd_txfm2d_8x4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3823 | |
|
3824 | 0 | break; |
3825 | 0 | case TX_4X8: |
3826 | |
|
3827 | 0 | svt_av1_fwd_txfm2d_4x8(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3828 | |
|
3829 | 0 | break; |
3830 | | |
3831 | 3.21k | case TX_64X64: |
3832 | | |
3833 | 3.21k | svt_av1_fwd_txfm2d_64x64(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3834 | | |
3835 | 3.21k | *three_quad_energy = svt_handle_transform64x64(coeff_buffer); |
3836 | | |
3837 | 3.21k | break; |
3838 | | |
3839 | 6.63k | case TX_32X32: |
3840 | 6.63k | if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST || |
3841 | 6.63k | transform_type == H_ADST || transform_type == V_FLIPADST || transform_type == H_FLIPADST) { |
3842 | | // Tahani: I believe those cases are never hit |
3843 | 0 | svt_av1_transform_two_d_32x32_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3844 | 0 | } |
3845 | | |
3846 | 6.63k | else { |
3847 | 6.63k | svt_av1_fwd_txfm2d_32x32(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3848 | 6.63k | } |
3849 | | |
3850 | 6.63k | break; |
3851 | | |
3852 | 3.91k | case TX_16X16: |
3853 | | |
3854 | 3.91k | svt_av1_fwd_txfm2d_16x16(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3855 | | |
3856 | 3.91k | break; |
3857 | 8.44k | case TX_8X8: |
3858 | | |
3859 | 8.44k | svt_av1_fwd_txfm2d_8x8(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3860 | | |
3861 | 8.44k | break; |
3862 | 5.62k | case TX_4X4: |
3863 | | |
3864 | 5.62k | svt_av1_fwd_txfm2d_4x4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth); |
3865 | | |
3866 | 5.62k | break; |
3867 | 0 | default: |
3868 | 0 | assert(0); |
3869 | 0 | break; |
3870 | 30.1k | } |
3871 | | |
3872 | 30.1k | return return_error; |
3873 | 30.1k | } |
3874 | | |
3875 | | /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per |
3876 | | pixel. |
3877 | | Shared for both high and low bit depth. |
3878 | | */ |
3879 | 1.20M | void svt_av1_fwht4x4_c(int16_t* input, int32_t* output, uint32_t stride) { |
3880 | 1.20M | int i; |
3881 | 1.20M | int64_t a1, b1, c1, d1, e1; |
3882 | 1.20M | const int16_t* ip_pass0 = input; |
3883 | 1.20M | const int32_t* ip = NULL; |
3884 | 1.20M | int32_t* op = output; |
3885 | | |
3886 | 5.99M | for (i = 0; i < 4; i++) { |
3887 | 4.79M | a1 = ip_pass0[0 * stride]; |
3888 | 4.79M | b1 = ip_pass0[1 * stride]; |
3889 | 4.79M | c1 = ip_pass0[2 * stride]; |
3890 | 4.79M | d1 = ip_pass0[3 * stride]; |
3891 | | |
3892 | 4.79M | a1 += b1; |
3893 | 4.79M | d1 = d1 - c1; |
3894 | 4.79M | e1 = (a1 - d1) >> 1; |
3895 | 4.79M | b1 = e1 - b1; |
3896 | 4.79M | c1 = e1 - c1; |
3897 | 4.79M | a1 -= c1; |
3898 | 4.79M | d1 += b1; |
3899 | 4.79M | op[0] = (int32_t)a1; |
3900 | 4.79M | op[1] = (int32_t)c1; |
3901 | 4.79M | op[2] = (int32_t)d1; |
3902 | 4.79M | op[3] = (int32_t)b1; |
3903 | | |
3904 | 4.79M | ip_pass0++; |
3905 | 4.79M | op += 4; |
3906 | 4.79M | } |
3907 | 1.20M | ip = output; |
3908 | 1.20M | op = output; |
3909 | | |
3910 | 5.97M | for (i = 0; i < 4; i++) { |
3911 | 4.77M | a1 = ip[4 * 0]; |
3912 | 4.77M | b1 = ip[4 * 1]; |
3913 | 4.77M | c1 = ip[4 * 2]; |
3914 | 4.77M | d1 = ip[4 * 3]; |
3915 | | |
3916 | 4.77M | a1 += b1; |
3917 | 4.77M | d1 -= c1; |
3918 | 4.77M | e1 = (a1 - d1) >> 1; |
3919 | 4.77M | b1 = e1 - b1; |
3920 | 4.77M | c1 = e1 - c1; |
3921 | 4.77M | a1 -= c1; |
3922 | 4.77M | d1 += b1; |
3923 | 4.77M | op[4 * 0] = (int32_t)(a1 * UNIT_QUANT_FACTOR); |
3924 | 4.77M | op[4 * 1] = (int32_t)(c1 * UNIT_QUANT_FACTOR); |
3925 | 4.77M | op[4 * 2] = (int32_t)(d1 * UNIT_QUANT_FACTOR); |
3926 | 4.77M | op[4 * 3] = (int32_t)(b1 * UNIT_QUANT_FACTOR); |
3927 | | |
3928 | 4.77M | ip++; |
3929 | 4.77M | op++; |
3930 | 4.77M | } |
3931 | 1.20M | } |
3932 | | |
3933 | | /********************************************************************* |
3934 | | * Transform |
3935 | | * Note there is an implicit assumption that TU Size <= PU Size, |
3936 | | * which is different than the HEVC requirements. |
3937 | | *********************************************************************/ |
3938 | | EbErrorType svt_aom_estimate_transform(PictureControlSet* pcs, ModeDecisionContext* ctx, int16_t* residual_buffer, |
3939 | | uint32_t residual_stride, int32_t* coeff_buffer, uint32_t coeff_stride, |
3940 | | TxSize transform_size, uint64_t* three_quad_energy, uint32_t bit_depth, |
3941 | | TxType transform_type, PlaneType component_type, TxCoeffShape trans_coeff_shape) |
3942 | | |
3943 | 1.22M | { |
3944 | 1.22M | (void)trans_coeff_shape; |
3945 | 1.22M | (void)coeff_stride; |
3946 | 1.22M | (void)component_type; |
3947 | | |
3948 | 1.22M | if (svt_av1_is_lossless_segment(pcs, ctx->blk_ptr->segment_id)) { |
3949 | 1.20M | assert(transform_type == DCT_DCT); |
3950 | 1.20M | int32_t dst[16]; |
3951 | | |
3952 | 1.20M | svt_av1_fwht4x4(residual_buffer, dst, residual_stride); |
3953 | 5.82M | for (int i = 0; i < 4; i++) { |
3954 | 22.8M | for (int j = 0; j < 4; j++) { |
3955 | 18.1M | coeff_buffer[(j << 2) + i] = dst[(i << 2) + j]; |
3956 | 18.1M | } |
3957 | 4.62M | } |
3958 | 1.20M | return EB_ErrorNone; |
3959 | 1.20M | } |
3960 | | |
3961 | 29.1k | switch (trans_coeff_shape) { |
3962 | 30.1k | case DEFAULT_SHAPE: |
3963 | 30.1k | return av1_estimate_transform_default(residual_buffer, |
3964 | 30.1k | residual_stride, |
3965 | 30.1k | coeff_buffer, |
3966 | 30.1k | coeff_stride, |
3967 | 30.1k | transform_size, |
3968 | 30.1k | three_quad_energy, |
3969 | 30.1k | bit_depth, |
3970 | 30.1k | transform_type, |
3971 | 30.1k | component_type); |
3972 | 0 | case N2_SHAPE: |
3973 | 0 | return av1_estimate_transform_N2(residual_buffer, |
3974 | 0 | residual_stride, |
3975 | 0 | coeff_buffer, |
3976 | 0 | coeff_stride, |
3977 | 0 | transform_size, |
3978 | 0 | three_quad_energy, |
3979 | 0 | bit_depth, |
3980 | 0 | transform_type, |
3981 | 0 | component_type); |
3982 | 0 | case N4_SHAPE: |
3983 | 0 | return av1_estimate_transform_N4(residual_buffer, |
3984 | 0 | residual_stride, |
3985 | 0 | coeff_buffer, |
3986 | 0 | coeff_stride, |
3987 | 0 | transform_size, |
3988 | 0 | three_quad_energy, |
3989 | 0 | bit_depth, |
3990 | 0 | transform_type, |
3991 | 0 | component_type); |
3992 | 0 | case ONLY_DC_SHAPE: |
3993 | 0 | return av1_estimate_transform_ONLY_DC(residual_buffer, |
3994 | 0 | residual_stride, |
3995 | 0 | coeff_buffer, |
3996 | 0 | coeff_stride, |
3997 | 0 | transform_size, |
3998 | 0 | three_quad_energy, |
3999 | 0 | bit_depth, |
4000 | 0 | transform_type, |
4001 | 0 | component_type); |
4002 | 29.1k | } |
4003 | | |
4004 | 29.1k | assert(0); |
4005 | 0 | return EB_ErrorBadParameter; |
4006 | 29.1k | } |
4007 | | |
4008 | | // PF_N4 |
4009 | 0 | static void highbd_fwd_txfm_64x64_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4010 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4011 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4012 | 0 | const int bd = txfm_param->bd; |
4013 | 0 | svt_av1_fwd_txfm2d_64x64_N4(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); |
4014 | 0 | } |
4015 | | |
4016 | 0 | static void highbd_fwd_txfm_32x64_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4017 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4018 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4019 | 0 | const int bd = txfm_param->bd; |
4020 | 0 | svt_av1_fwd_txfm2d_32x64_N4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, bd); |
4021 | 0 | } |
4022 | | |
4023 | 0 | static void highbd_fwd_txfm_64x32_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4024 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4025 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4026 | 0 | const int bd = txfm_param->bd; |
4027 | 0 | svt_av1_fwd_txfm2d_64x32_N4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, bd); |
4028 | 0 | } |
4029 | | |
4030 | 0 | static void highbd_fwd_txfm_16x64_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4031 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4032 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4033 | 0 | const int bd = txfm_param->bd; |
4034 | 0 | svt_av1_fwd_txfm2d_16x64_N4(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); |
4035 | 0 | } |
4036 | | |
4037 | 0 | static void highbd_fwd_txfm_64x16_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4038 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4039 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4040 | 0 | const int bd = txfm_param->bd; |
4041 | 0 | svt_av1_fwd_txfm2d_64x16_N4(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); |
4042 | 0 | } |
4043 | | |
4044 | 0 | static void highbd_fwd_txfm_32x32_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4045 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4046 | 0 | const TxType tx_type = txfm_param->tx_type; |
4047 | 0 | const int bd = txfm_param->bd; |
4048 | 0 | svt_av1_fwd_txfm2d_32x32_N4(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4049 | 0 | } |
4050 | | |
4051 | 0 | static void highbd_fwd_txfm_16x16_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4052 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4053 | 0 | const TxType tx_type = txfm_param->tx_type; |
4054 | 0 | const int bd = txfm_param->bd; |
4055 | 0 | svt_av1_fwd_txfm2d_16x16_N4(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4056 | 0 | } |
4057 | | |
4058 | 0 | static void highbd_fwd_txfm_8x8_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4059 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4060 | 0 | const TxType tx_type = txfm_param->tx_type; |
4061 | 0 | const int bd = txfm_param->bd; |
4062 | 0 | svt_av1_fwd_txfm2d_8x8_N4(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4063 | 0 | } |
4064 | | |
4065 | 0 | static void highbd_fwd_txfm_4x8_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4066 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4067 | 0 | svt_av1_fwd_txfm2d_4x8_N4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4068 | 0 | } |
4069 | | |
4070 | 0 | static void highbd_fwd_txfm_8x4_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4071 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4072 | 0 | svt_av1_fwd_txfm2d_8x4_N4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4073 | 0 | } |
4074 | | |
4075 | 0 | static void highbd_fwd_txfm_8x16_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4076 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4077 | 0 | const TxType tx_type = txfm_param->tx_type; |
4078 | 0 | const int bd = txfm_param->bd; |
4079 | 0 | svt_av1_fwd_txfm2d_8x16_N4(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4080 | 0 | } |
4081 | | |
4082 | 0 | static void highbd_fwd_txfm_16x8_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4083 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4084 | 0 | const TxType tx_type = txfm_param->tx_type; |
4085 | 0 | const int bd = txfm_param->bd; |
4086 | 0 | svt_av1_fwd_txfm2d_16x8_N4(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4087 | 0 | } |
4088 | | |
4089 | 0 | static void highbd_fwd_txfm_16x32_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4090 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4091 | 0 | svt_av1_fwd_txfm2d_16x32_N4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4092 | 0 | } |
4093 | | |
4094 | 0 | static void highbd_fwd_txfm_32x16_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4095 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4096 | 0 | svt_av1_fwd_txfm2d_32x16_N4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4097 | 0 | } |
4098 | | |
4099 | 0 | static void highbd_fwd_txfm_4x16_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4100 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4101 | 0 | svt_av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4102 | 0 | } |
4103 | | |
4104 | 0 | static void highbd_fwd_txfm_16x4_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4105 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4106 | 0 | svt_av1_fwd_txfm2d_16x4_N4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4107 | 0 | } |
4108 | | |
4109 | 0 | static void highbd_fwd_txfm_8x32_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4110 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4111 | 0 | svt_av1_fwd_txfm2d_8x32_N4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4112 | 0 | } |
4113 | | |
4114 | 0 | static void highbd_fwd_txfm_32x8_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4115 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4116 | 0 | svt_av1_fwd_txfm2d_32x8_N4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4117 | 0 | } |
4118 | | |
4119 | | //PF_N2 |
4120 | 0 | static void highbd_fwd_txfm_64x64_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4121 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4122 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4123 | 0 | const int bd = txfm_param->bd; |
4124 | 0 | svt_av1_fwd_txfm2d_64x64_N2(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); |
4125 | 0 | } |
4126 | | |
4127 | 0 | static void highbd_fwd_txfm_32x64_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4128 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4129 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4130 | 0 | const int bd = txfm_param->bd; |
4131 | 0 | svt_av1_fwd_txfm2d_32x64_N2(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, bd); |
4132 | 0 | } |
4133 | | |
4134 | 0 | static void highbd_fwd_txfm_64x32_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4135 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4136 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4137 | 0 | const int bd = txfm_param->bd; |
4138 | 0 | svt_av1_fwd_txfm2d_64x32_N2(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, bd); |
4139 | 0 | } |
4140 | | |
4141 | 0 | static void highbd_fwd_txfm_16x64_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4142 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4143 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4144 | 0 | const int bd = txfm_param->bd; |
4145 | 0 | svt_av1_fwd_txfm2d_16x64_N2(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); |
4146 | 0 | } |
4147 | | |
4148 | 0 | static void highbd_fwd_txfm_64x16_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4149 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4150 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4151 | 0 | const int bd = txfm_param->bd; |
4152 | 0 | svt_av1_fwd_txfm2d_64x16_N2(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); |
4153 | 0 | } |
4154 | | |
4155 | 0 | static void highbd_fwd_txfm_32x32_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4156 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4157 | 0 | const TxType tx_type = txfm_param->tx_type; |
4158 | 0 | const int bd = txfm_param->bd; |
4159 | 0 | svt_av1_fwd_txfm2d_32x32_N2(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4160 | 0 | } |
4161 | | |
4162 | 0 | static void highbd_fwd_txfm_16x16_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4163 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4164 | 0 | const TxType tx_type = txfm_param->tx_type; |
4165 | 0 | const int bd = txfm_param->bd; |
4166 | 0 | svt_av1_fwd_txfm2d_16x16_N2(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4167 | 0 | } |
4168 | | |
4169 | 0 | static void highbd_fwd_txfm_8x8_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4170 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4171 | 0 | const TxType tx_type = txfm_param->tx_type; |
4172 | 0 | const int bd = txfm_param->bd; |
4173 | 0 | svt_av1_fwd_txfm2d_8x8_N2(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4174 | 0 | } |
4175 | | |
4176 | 0 | static void highbd_fwd_txfm_4x8_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4177 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4178 | 0 | svt_av1_fwd_txfm2d_4x8_N2(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4179 | 0 | } |
4180 | | |
4181 | 0 | static void highbd_fwd_txfm_8x4_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4182 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4183 | 0 | svt_av1_fwd_txfm2d_8x4_N2(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4184 | 0 | } |
4185 | | |
4186 | 0 | static void highbd_fwd_txfm_8x16_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4187 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4188 | 0 | const TxType tx_type = txfm_param->tx_type; |
4189 | 0 | const int bd = txfm_param->bd; |
4190 | 0 | svt_av1_fwd_txfm2d_8x16_N2(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4191 | 0 | } |
4192 | | |
4193 | 0 | static void highbd_fwd_txfm_16x8_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4194 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4195 | 0 | const TxType tx_type = txfm_param->tx_type; |
4196 | 0 | const int bd = txfm_param->bd; |
4197 | 0 | svt_av1_fwd_txfm2d_16x8_N2(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4198 | 0 | } |
4199 | | |
4200 | 0 | static void highbd_fwd_txfm_16x32_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4201 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4202 | 0 | svt_av1_fwd_txfm2d_16x32_N2(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4203 | 0 | } |
4204 | | |
4205 | 0 | static void highbd_fwd_txfm_32x16_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4206 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4207 | 0 | svt_av1_fwd_txfm2d_32x16_N2(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4208 | 0 | } |
4209 | | |
4210 | 0 | static void highbd_fwd_txfm_4x16_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4211 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4212 | 0 | svt_av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4213 | 0 | } |
4214 | | |
4215 | 0 | static void highbd_fwd_txfm_16x4_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4216 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4217 | 0 | svt_av1_fwd_txfm2d_16x4_N2(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4218 | 0 | } |
4219 | | |
4220 | 0 | static void highbd_fwd_txfm_8x32_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4221 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4222 | 0 | svt_av1_fwd_txfm2d_8x32_N2(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4223 | 0 | } |
4224 | | |
4225 | 0 | static void highbd_fwd_txfm_32x8_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4226 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4227 | 0 | svt_av1_fwd_txfm2d_32x8_N2(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4228 | 0 | } |
4229 | | |
4230 | 0 | static void highbd_fwd_txfm_64x64(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4231 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4232 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4233 | 0 | const int bd = txfm_param->bd; |
4234 | 0 | svt_av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); |
4235 | 0 | } |
4236 | | |
4237 | 0 | static void highbd_fwd_txfm_32x64(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4238 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4239 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4240 | 0 | const int bd = txfm_param->bd; |
4241 | 0 | svt_av1_fwd_txfm2d_32x64(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, bd); |
4242 | 0 | } |
4243 | | |
4244 | 0 | static void highbd_fwd_txfm_64x32(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4245 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4246 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4247 | 0 | const int bd = txfm_param->bd; |
4248 | 0 | svt_av1_fwd_txfm2d_64x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, bd); |
4249 | 0 | } |
4250 | | |
4251 | 0 | static void highbd_fwd_txfm_16x64(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4252 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4253 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4254 | 0 | const int bd = txfm_param->bd; |
4255 | 0 | svt_av1_fwd_txfm2d_16x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); |
4256 | 0 | } |
4257 | | |
4258 | 0 | static void highbd_fwd_txfm_64x16(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4259 | 0 | assert(txfm_param->tx_type == DCT_DCT); |
4260 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4261 | 0 | const int bd = txfm_param->bd; |
4262 | 0 | svt_av1_fwd_txfm2d_64x16(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); |
4263 | 0 | } |
4264 | | |
4265 | 0 | static void highbd_fwd_txfm_32x32(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4266 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4267 | 0 | const TxType tx_type = txfm_param->tx_type; |
4268 | 0 | const int bd = txfm_param->bd; |
4269 | 0 | svt_av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4270 | 0 | } |
4271 | | |
4272 | 0 | static void highbd_fwd_txfm_16x16(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4273 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4274 | 0 | const TxType tx_type = txfm_param->tx_type; |
4275 | 0 | const int bd = txfm_param->bd; |
4276 | 0 | svt_av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4277 | 0 | } |
4278 | | |
4279 | 0 | static void highbd_fwd_txfm_8x8(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4280 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4281 | 0 | const TxType tx_type = txfm_param->tx_type; |
4282 | 0 | const int bd = txfm_param->bd; |
4283 | 0 | svt_av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4284 | 0 | } |
4285 | | |
4286 | 0 | static void highbd_fwd_txfm_4x8(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4287 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4288 | 0 | svt_av1_fwd_txfm2d_4x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4289 | 0 | } |
4290 | | |
4291 | 0 | static void highbd_fwd_txfm_8x4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4292 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4293 | 0 | svt_av1_fwd_txfm2d_8x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4294 | 0 | } |
4295 | | |
4296 | 0 | static void highbd_fwd_txfm_8x16(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4297 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4298 | 0 | const TxType tx_type = txfm_param->tx_type; |
4299 | 0 | const int bd = txfm_param->bd; |
4300 | 0 | svt_av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4301 | 0 | } |
4302 | | |
4303 | 0 | static void highbd_fwd_txfm_16x8(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4304 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4305 | 0 | const TxType tx_type = txfm_param->tx_type; |
4306 | 0 | const int bd = txfm_param->bd; |
4307 | 0 | svt_av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd); |
4308 | 0 | } |
4309 | | |
4310 | 0 | static void highbd_fwd_txfm_16x32(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4311 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4312 | 0 | svt_av1_fwd_txfm2d_16x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4313 | 0 | } |
4314 | | |
4315 | 0 | static void highbd_fwd_txfm_32x16(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4316 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4317 | 0 | svt_av1_fwd_txfm2d_32x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4318 | 0 | } |
4319 | | |
4320 | 0 | static void highbd_fwd_txfm_4x16(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4321 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4322 | 0 | svt_av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4323 | 0 | } |
4324 | | |
4325 | 0 | static void highbd_fwd_txfm_16x4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4326 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4327 | 0 | svt_av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4328 | 0 | } |
4329 | | |
4330 | 0 | static void highbd_fwd_txfm_8x32(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4331 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4332 | 0 | svt_av1_fwd_txfm2d_8x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4333 | 0 | } |
4334 | | |
4335 | 0 | static void highbd_fwd_txfm_32x8(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4336 | 0 | int32_t* dst_coeff = (int32_t*)coeff; |
4337 | 0 | svt_av1_fwd_txfm2d_32x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); |
4338 | 0 | } |
4339 | | |
4340 | 0 | void svt_av1_highbd_fwd_txfm_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4341 | 0 | assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); |
4342 | 0 | const TxSize tx_size = txfm_param->tx_size; |
4343 | 0 | switch (tx_size) { |
4344 | 0 | case TX_64X64: |
4345 | 0 | highbd_fwd_txfm_64x64_n4(src_diff, coeff, diff_stride, txfm_param); |
4346 | 0 | break; |
4347 | 0 | case TX_32X64: |
4348 | 0 | highbd_fwd_txfm_32x64_n4(src_diff, coeff, diff_stride, txfm_param); |
4349 | 0 | break; |
4350 | 0 | case TX_64X32: |
4351 | 0 | highbd_fwd_txfm_64x32_n4(src_diff, coeff, diff_stride, txfm_param); |
4352 | 0 | break; |
4353 | 0 | case TX_16X64: |
4354 | 0 | highbd_fwd_txfm_16x64_n4(src_diff, coeff, diff_stride, txfm_param); |
4355 | 0 | break; |
4356 | 0 | case TX_64X16: |
4357 | 0 | highbd_fwd_txfm_64x16_n4(src_diff, coeff, diff_stride, txfm_param); |
4358 | 0 | break; |
4359 | 0 | case TX_32X32: |
4360 | 0 | highbd_fwd_txfm_32x32_n4(src_diff, coeff, diff_stride, txfm_param); |
4361 | 0 | break; |
4362 | 0 | case TX_16X16: |
4363 | 0 | highbd_fwd_txfm_16x16_n4(src_diff, coeff, diff_stride, txfm_param); |
4364 | 0 | break; |
4365 | 0 | case TX_8X8: |
4366 | 0 | highbd_fwd_txfm_8x8_n4(src_diff, coeff, diff_stride, txfm_param); |
4367 | 0 | break; |
4368 | 0 | case TX_4X8: |
4369 | 0 | highbd_fwd_txfm_4x8_n4(src_diff, coeff, diff_stride, txfm_param); |
4370 | 0 | break; |
4371 | 0 | case TX_8X4: |
4372 | 0 | highbd_fwd_txfm_8x4_n4(src_diff, coeff, diff_stride, txfm_param); |
4373 | 0 | break; |
4374 | 0 | case TX_8X16: |
4375 | 0 | highbd_fwd_txfm_8x16_n4(src_diff, coeff, diff_stride, txfm_param); |
4376 | 0 | break; |
4377 | 0 | case TX_16X8: |
4378 | 0 | highbd_fwd_txfm_16x8_n4(src_diff, coeff, diff_stride, txfm_param); |
4379 | 0 | break; |
4380 | 0 | case TX_16X32: |
4381 | 0 | highbd_fwd_txfm_16x32_n4(src_diff, coeff, diff_stride, txfm_param); |
4382 | 0 | break; |
4383 | 0 | case TX_32X16: |
4384 | 0 | highbd_fwd_txfm_32x16_n4(src_diff, coeff, diff_stride, txfm_param); |
4385 | 0 | break; |
4386 | 0 | case TX_4X4: |
4387 | | //hack highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); |
4388 | 0 | break; |
4389 | 0 | case TX_4X16: |
4390 | 0 | highbd_fwd_txfm_4x16_n4(src_diff, coeff, diff_stride, txfm_param); |
4391 | 0 | break; |
4392 | 0 | case TX_16X4: |
4393 | 0 | highbd_fwd_txfm_16x4_n4(src_diff, coeff, diff_stride, txfm_param); |
4394 | 0 | break; |
4395 | 0 | case TX_8X32: |
4396 | 0 | highbd_fwd_txfm_8x32_n4(src_diff, coeff, diff_stride, txfm_param); |
4397 | 0 | break; |
4398 | 0 | case TX_32X8: |
4399 | 0 | highbd_fwd_txfm_32x8_n4(src_diff, coeff, diff_stride, txfm_param); |
4400 | 0 | break; |
4401 | 0 | default: |
4402 | 0 | assert(0); |
4403 | 0 | break; |
4404 | 0 | } |
4405 | 0 | } |
4406 | | |
4407 | 0 | void svt_av1_highbd_fwd_txfm_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4408 | 0 | assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); |
4409 | 0 | const TxSize tx_size = txfm_param->tx_size; |
4410 | 0 | switch (tx_size) { |
4411 | 0 | case TX_64X64: |
4412 | 0 | highbd_fwd_txfm_64x64_n2(src_diff, coeff, diff_stride, txfm_param); |
4413 | 0 | break; |
4414 | 0 | case TX_32X64: |
4415 | 0 | highbd_fwd_txfm_32x64_n2(src_diff, coeff, diff_stride, txfm_param); |
4416 | 0 | break; |
4417 | 0 | case TX_64X32: |
4418 | 0 | highbd_fwd_txfm_64x32_n2(src_diff, coeff, diff_stride, txfm_param); |
4419 | 0 | break; |
4420 | 0 | case TX_16X64: |
4421 | 0 | highbd_fwd_txfm_16x64_n2(src_diff, coeff, diff_stride, txfm_param); |
4422 | 0 | break; |
4423 | 0 | case TX_64X16: |
4424 | 0 | highbd_fwd_txfm_64x16_n2(src_diff, coeff, diff_stride, txfm_param); |
4425 | 0 | break; |
4426 | 0 | case TX_32X32: |
4427 | 0 | highbd_fwd_txfm_32x32_n2(src_diff, coeff, diff_stride, txfm_param); |
4428 | 0 | break; |
4429 | 0 | case TX_16X16: |
4430 | 0 | highbd_fwd_txfm_16x16_n2(src_diff, coeff, diff_stride, txfm_param); |
4431 | 0 | break; |
4432 | 0 | case TX_8X8: |
4433 | 0 | highbd_fwd_txfm_8x8_n2(src_diff, coeff, diff_stride, txfm_param); |
4434 | 0 | break; |
4435 | 0 | case TX_4X8: |
4436 | 0 | highbd_fwd_txfm_4x8_n2(src_diff, coeff, diff_stride, txfm_param); |
4437 | 0 | break; |
4438 | 0 | case TX_8X4: |
4439 | 0 | highbd_fwd_txfm_8x4_n2(src_diff, coeff, diff_stride, txfm_param); |
4440 | 0 | break; |
4441 | 0 | case TX_8X16: |
4442 | 0 | highbd_fwd_txfm_8x16_n2(src_diff, coeff, diff_stride, txfm_param); |
4443 | 0 | break; |
4444 | 0 | case TX_16X8: |
4445 | 0 | highbd_fwd_txfm_16x8_n2(src_diff, coeff, diff_stride, txfm_param); |
4446 | 0 | break; |
4447 | 0 | case TX_16X32: |
4448 | 0 | highbd_fwd_txfm_16x32_n2(src_diff, coeff, diff_stride, txfm_param); |
4449 | 0 | break; |
4450 | 0 | case TX_32X16: |
4451 | 0 | highbd_fwd_txfm_32x16_n2(src_diff, coeff, diff_stride, txfm_param); |
4452 | 0 | break; |
4453 | 0 | case TX_4X4: |
4454 | | //hack highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); |
4455 | 0 | break; |
4456 | 0 | case TX_4X16: |
4457 | 0 | highbd_fwd_txfm_4x16_n2(src_diff, coeff, diff_stride, txfm_param); |
4458 | 0 | break; |
4459 | 0 | case TX_16X4: |
4460 | 0 | highbd_fwd_txfm_16x4_n2(src_diff, coeff, diff_stride, txfm_param); |
4461 | 0 | break; |
4462 | 0 | case TX_8X32: |
4463 | 0 | highbd_fwd_txfm_8x32_n2(src_diff, coeff, diff_stride, txfm_param); |
4464 | 0 | break; |
4465 | 0 | case TX_32X8: |
4466 | 0 | highbd_fwd_txfm_32x8_n2(src_diff, coeff, diff_stride, txfm_param); |
4467 | 0 | break; |
4468 | 0 | default: |
4469 | 0 | assert(0); |
4470 | 0 | break; |
4471 | 0 | } |
4472 | 0 | } |
4473 | | |
4474 | 0 | void svt_av1_highbd_fwd_txfm(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) { |
4475 | 0 | assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); |
4476 | 0 | const TxSize tx_size = txfm_param->tx_size; |
4477 | 0 | switch (tx_size) { |
4478 | 0 | case TX_64X64: |
4479 | 0 | highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); |
4480 | 0 | break; |
4481 | 0 | case TX_32X64: |
4482 | 0 | highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param); |
4483 | 0 | break; |
4484 | 0 | case TX_64X32: |
4485 | 0 | highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param); |
4486 | 0 | break; |
4487 | 0 | case TX_16X64: |
4488 | 0 | highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param); |
4489 | 0 | break; |
4490 | 0 | case TX_64X16: |
4491 | 0 | highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param); |
4492 | 0 | break; |
4493 | 0 | case TX_32X32: |
4494 | 0 | highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); |
4495 | 0 | break; |
4496 | 0 | case TX_16X16: |
4497 | 0 | highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param); |
4498 | 0 | break; |
4499 | 0 | case TX_8X8: |
4500 | 0 | highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); |
4501 | 0 | break; |
4502 | 0 | case TX_4X8: |
4503 | 0 | highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); |
4504 | 0 | break; |
4505 | 0 | case TX_8X4: |
4506 | 0 | highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); |
4507 | 0 | break; |
4508 | 0 | case TX_8X16: |
4509 | 0 | highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param); |
4510 | 0 | break; |
4511 | 0 | case TX_16X8: |
4512 | 0 | highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param); |
4513 | 0 | break; |
4514 | 0 | case TX_16X32: |
4515 | 0 | highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param); |
4516 | 0 | break; |
4517 | 0 | case TX_32X16: |
4518 | 0 | highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param); |
4519 | 0 | break; |
4520 | 0 | case TX_4X4: |
4521 | | //hack highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); |
4522 | 0 | break; |
4523 | 0 | case TX_4X16: |
4524 | 0 | highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); |
4525 | 0 | break; |
4526 | 0 | case TX_16X4: |
4527 | 0 | highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); |
4528 | 0 | break; |
4529 | 0 | case TX_8X32: |
4530 | 0 | highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); |
4531 | 0 | break; |
4532 | 0 | case TX_32X8: |
4533 | 0 | highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); |
4534 | 0 | break; |
4535 | 0 | default: |
4536 | 0 | assert(0); |
4537 | 0 | break; |
4538 | 0 | } |
4539 | 0 | } |
4540 | | |
4541 | | void svt_av1_wht_fwd_txfm(int16_t* src_diff, int bw, int32_t* coeff, TxSize tx_size, TxCoeffShape pf_shape, |
4542 | 0 | int bit_depth, int is_hbd) { |
4543 | 0 | TxfmParam txfm_param; |
4544 | 0 | txfm_param.tx_type = DCT_DCT; |
4545 | 0 | txfm_param.tx_size = tx_size; |
4546 | 0 | txfm_param.lossless = 0; |
4547 | 0 | txfm_param.tx_set_type = EXT_TX_SET_ALL16; |
4548 | |
|
4549 | 0 | txfm_param.bd = bit_depth; |
4550 | 0 | txfm_param.is_hbd = is_hbd; |
4551 | 0 | switch (pf_shape) { |
4552 | 0 | case N4_SHAPE: |
4553 | 0 | svt_av1_highbd_fwd_txfm_n4(src_diff, coeff, bw, &txfm_param); |
4554 | 0 | break; |
4555 | 0 | case N2_SHAPE: |
4556 | 0 | svt_av1_highbd_fwd_txfm_n2(src_diff, coeff, bw, &txfm_param); |
4557 | 0 | break; |
4558 | 0 | default: |
4559 | 0 | svt_av1_highbd_fwd_txfm(src_diff, coeff, bw, &txfm_param); |
4560 | 0 | } |
4561 | 0 | } |
4562 | | |
4563 | 0 | void svt_av1_fidentity16_N2_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
4564 | 0 | (void)stage_range; |
4565 | 0 | (void)cos_bit; |
4566 | 0 | for (int32_t i = 0; i < 8; ++i) { |
4567 | 0 | output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits); |
4568 | 0 | } |
4569 | |
|
4570 | 0 | assert(stage_range[0] + new_sqrt2_bits <= 32); |
4571 | 0 | } |
4572 | | |
/*
 * N2 variant of the 16-point fadst butterfly network: only output[0..7]
 * receive final results (stage 9 writes just those eight entries, and stage 8
 * computes only the step[] entries that stage 9 consumes).
 *
 * input:       16 values, read in stage 1 only.
 * output:      must not be the same pointer as input (asserted). It is used as
 *              a ping-pong buffer together with the local step[] array, so all
 *              16 entries are written along the way; on return output[8..15]
 *              still hold the stage-7 intermediates.
 * cos_bit:     passed to cospi_arr() to obtain the cospi[] table and forwarded
 *              to every half_btf() call.
 * stage_range: unused.
 */
4573 | 0 | void svt_av1_fadst16_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
4574 | 0 | (void)stage_range; |
4575 | 0 | const int32_t* cospi; |
4576 | 

4577 | 0 | int32_t *bf0, *bf1; |
4578 | 0 | int32_t step[16]; |
4579 | | |
4580 | | // stage 0; |
4581 | | |
4582 | | // stage 1; |
// Signed permutation of the 16 inputs into output[].
4583 | 0 | assert(output != input); |
4584 | 0 | bf1 = output; |
4585 | 0 | bf1[0] = input[0]; |
4586 | 0 | bf1[1] = -input[15]; |
4587 | 0 | bf1[2] = -input[7]; |
4588 | 0 | bf1[3] = input[8]; |
4589 | 0 | bf1[4] = -input[3]; |
4590 | 0 | bf1[5] = input[12]; |
4591 | 0 | bf1[6] = input[4]; |
4592 | 0 | bf1[7] = -input[11]; |
4593 | 0 | bf1[8] = -input[1]; |
4594 | 0 | bf1[9] = input[14]; |
4595 | 0 | bf1[10] = input[6]; |
4596 | 0 | bf1[11] = -input[9]; |
4597 | 0 | bf1[12] = input[2]; |
4598 | 0 | bf1[13] = -input[13]; |
4599 | 0 | bf1[14] = -input[5]; |
4600 | 0 | bf1[15] = input[10]; |
4601 | | |
4602 | | // stage 2 |
4603 | 0 | cospi = cospi_arr(cos_bit); |
4604 | 0 | bf0 = output; |
4605 | 0 | bf1 = step; |
4606 | 0 | bf1[0] = bf0[0]; |
4607 | 0 | bf1[1] = bf0[1]; |
4608 | 0 | bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); |
4609 | 0 | bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); |
4610 | 0 | bf1[4] = bf0[4]; |
4611 | 0 | bf1[5] = bf0[5]; |
4612 | 0 | bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); |
4613 | 0 | bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); |
4614 | 0 | bf1[8] = bf0[8]; |
4615 | 0 | bf1[9] = bf0[9]; |
4616 | 0 | bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); |
4617 | 0 | bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); |
4618 | 0 | bf1[12] = bf0[12]; |
4619 | 0 | bf1[13] = bf0[13]; |
4620 | 0 | bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); |
4621 | 0 | bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); |
4622 | | |
4623 | | // stage 3 |
4624 | 0 | bf0 = step; |
4625 | 0 | bf1 = output; |
4626 | 0 | bf1[0] = bf0[0] + bf0[2]; |
4627 | 0 | bf1[1] = bf0[1] + bf0[3]; |
4628 | 0 | bf1[2] = bf0[0] - bf0[2]; |
4629 | 0 | bf1[3] = bf0[1] - bf0[3]; |
4630 | 0 | bf1[4] = bf0[4] + bf0[6]; |
4631 | 0 | bf1[5] = bf0[5] + bf0[7]; |
4632 | 0 | bf1[6] = bf0[4] - bf0[6]; |
4633 | 0 | bf1[7] = bf0[5] - bf0[7]; |
4634 | 0 | bf1[8] = bf0[8] + bf0[10]; |
4635 | 0 | bf1[9] = bf0[9] + bf0[11]; |
4636 | 0 | bf1[10] = bf0[8] - bf0[10]; |
4637 | 0 | bf1[11] = bf0[9] - bf0[11]; |
4638 | 0 | bf1[12] = bf0[12] + bf0[14]; |
4639 | 0 | bf1[13] = bf0[13] + bf0[15]; |
4640 | 0 | bf1[14] = bf0[12] - bf0[14]; |
4641 | 0 | bf1[15] = bf0[13] - bf0[15]; |
4642 | | |
4643 | | // stage 4 |
4644 | 0 | cospi = cospi_arr(cos_bit); |
4645 | 0 | bf0 = output; |
4646 | 0 | bf1 = step; |
4647 | 0 | bf1[0] = bf0[0]; |
4648 | 0 | bf1[1] = bf0[1]; |
4649 | 0 | bf1[2] = bf0[2]; |
4650 | 0 | bf1[3] = bf0[3]; |
4651 | 0 | bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); |
4652 | 0 | bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); |
4653 | 0 | bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); |
4654 | 0 | bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); |
4655 | 0 | bf1[8] = bf0[8]; |
4656 | 0 | bf1[9] = bf0[9]; |
4657 | 0 | bf1[10] = bf0[10]; |
4658 | 0 | bf1[11] = bf0[11]; |
4659 | 0 | bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); |
4660 | 0 | bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); |
4661 | 0 | bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); |
4662 | 0 | bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); |
4663 | | |
4664 | | // stage 5 |
4665 | 0 | bf0 = step; |
4666 | 0 | bf1 = output; |
4667 | 0 | bf1[0] = bf0[0] + bf0[4]; |
4668 | 0 | bf1[1] = bf0[1] + bf0[5]; |
4669 | 0 | bf1[2] = bf0[2] + bf0[6]; |
4670 | 0 | bf1[3] = bf0[3] + bf0[7]; |
4671 | 0 | bf1[4] = bf0[0] - bf0[4]; |
4672 | 0 | bf1[5] = bf0[1] - bf0[5]; |
4673 | 0 | bf1[6] = bf0[2] - bf0[6]; |
4674 | 0 | bf1[7] = bf0[3] - bf0[7]; |
4675 | 0 | bf1[8] = bf0[8] + bf0[12]; |
4676 | 0 | bf1[9] = bf0[9] + bf0[13]; |
4677 | 0 | bf1[10] = bf0[10] + bf0[14]; |
4678 | 0 | bf1[11] = bf0[11] + bf0[15]; |
4679 | 0 | bf1[12] = bf0[8] - bf0[12]; |
4680 | 0 | bf1[13] = bf0[9] - bf0[13]; |
4681 | 0 | bf1[14] = bf0[10] - bf0[14]; |
4682 | 0 | bf1[15] = bf0[11] - bf0[15]; |
4683 | | |
4684 | | // stage 6 |
4685 | 0 | cospi = cospi_arr(cos_bit); |
4686 | 0 | bf0 = output; |
4687 | 0 | bf1 = step; |
4688 | 0 | bf1[0] = bf0[0]; |
4689 | 0 | bf1[1] = bf0[1]; |
4690 | 0 | bf1[2] = bf0[2]; |
4691 | 0 | bf1[3] = bf0[3]; |
4692 | 0 | bf1[4] = bf0[4]; |
4693 | 0 | bf1[5] = bf0[5]; |
4694 | 0 | bf1[6] = bf0[6]; |
4695 | 0 | bf1[7] = bf0[7]; |
4696 | 0 | bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); |
4697 | 0 | bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); |
4698 | 0 | bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); |
4699 | 0 | bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); |
4700 | 0 | bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); |
4701 | 0 | bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); |
4702 | 0 | bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); |
4703 | 0 | bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); |
4704 | | |
4705 | | // stage 7 |
4706 | 0 | bf0 = step; |
4707 | 0 | bf1 = output; |
4708 | 0 | bf1[0] = bf0[0] + bf0[8]; |
4709 | 0 | bf1[1] = bf0[1] + bf0[9]; |
4710 | 0 | bf1[2] = bf0[2] + bf0[10]; |
4711 | 0 | bf1[3] = bf0[3] + bf0[11]; |
4712 | 0 | bf1[4] = bf0[4] + bf0[12]; |
4713 | 0 | bf1[5] = bf0[5] + bf0[13]; |
4714 | 0 | bf1[6] = bf0[6] + bf0[14]; |
4715 | 0 | bf1[7] = bf0[7] + bf0[15]; |
4716 | 0 | bf1[8] = bf0[0] - bf0[8]; |
4717 | 0 | bf1[9] = bf0[1] - bf0[9]; |
4718 | 0 | bf1[10] = bf0[2] - bf0[10]; |
4719 | 0 | bf1[11] = bf0[3] - bf0[11]; |
4720 | 0 | bf1[12] = bf0[4] - bf0[12]; |
4721 | 0 | bf1[13] = bf0[5] - bf0[13]; |
4722 | 0 | bf1[14] = bf0[6] - bf0[14]; |
4723 | 0 | bf1[15] = bf0[7] - bf0[15]; |
4724 | | |
4725 | | // stage 8 |
// Only the eight step[] entries read by stage 9 are computed; the other
// eight step[] slots keep their stage-6 values and are not read again.
4726 | 0 | cospi = cospi_arr(cos_bit); |
4727 | 0 | bf0 = output; |
4728 | 0 | bf1 = step; |
4729 | 0 | bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); |
4730 | 0 | bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); |
4731 | 0 | bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); |
4732 | 0 | bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); |
4733 | 0 | bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); |
4734 | 0 | bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); |
4735 | 0 | bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); |
4736 | 0 | bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); |
4737 | | |
4738 | | // stage 9 |
// Final reordering into output[0..7]; output[8..15] are not rewritten.
4739 | 0 | bf0 = step; |
4740 | 0 | bf1 = output; |
4741 | 0 | bf1[0] = bf0[1]; |
4742 | 0 | bf1[1] = bf0[14]; |
4743 | 0 | bf1[2] = bf0[3]; |
4744 | 0 | bf1[3] = bf0[12]; |
4745 | 0 | bf1[4] = bf0[5]; |
4746 | 0 | bf1[5] = bf0[10]; |
4747 | 0 | bf1[6] = bf0[7]; |
4748 | 0 | bf1[7] = bf0[8]; |
4749 | 0 | } |
4750 | | |
/*
 * N2 variant of the 16-point fdct butterfly network: only output[0..7]
 * receive final results (stage 7 writes just those eight entries, and the
 * later stages skip the entries that would only feed the dropped outputs).
 *
 * input:       16 values, read in stage 1 only.
 * output:      used as a ping-pong buffer together with the local step[]
 *              array, so all 16 entries are written along the way; on return
 *              output[8..15] still hold the stage-5 intermediates. Stage 1
 *              reads input[k] after output[k] has already been written, so
 *              passing output == input would consume overwritten values (this
 *              is not asserted here).
 * cos_bit:     passed to cospi_arr() to obtain the cospi[] table and forwarded
 *              to every half_btf() call.
 * stage_range: unused.
 */
4751 | 0 | void svt_av1_fdct16_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
4752 | 0 | (void)stage_range; |
4753 | 0 | const int32_t* cospi; |
4754 | 

4755 | 0 | int32_t *bf0, *bf1; |
4756 | 0 | int32_t step[16]; |
4757 | | |
4758 | | // stage 0; |
4759 | | |
4760 | | // stage 1; |
// Mirror sums in output[0..7], mirror differences in output[8..15].
4761 | 0 | bf1 = output; |
4762 | 0 | bf1[0] = input[0] + input[15]; |
4763 | 0 | bf1[1] = input[1] + input[14]; |
4764 | 0 | bf1[2] = input[2] + input[13]; |
4765 | 0 | bf1[3] = input[3] + input[12]; |
4766 | 0 | bf1[4] = input[4] + input[11]; |
4767 | 0 | bf1[5] = input[5] + input[10]; |
4768 | 0 | bf1[6] = input[6] + input[9]; |
4769 | 0 | bf1[7] = input[7] + input[8]; |
4770 | 0 | bf1[8] = -input[8] + input[7]; |
4771 | 0 | bf1[9] = -input[9] + input[6]; |
4772 | 0 | bf1[10] = -input[10] + input[5]; |
4773 | 0 | bf1[11] = -input[11] + input[4]; |
4774 | 0 | bf1[12] = -input[12] + input[3]; |
4775 | 0 | bf1[13] = -input[13] + input[2]; |
4776 | 0 | bf1[14] = -input[14] + input[1]; |
4777 | 0 | bf1[15] = -input[15] + input[0]; |
4778 | | |
4779 | | // stage 2 |
4780 | 0 | cospi = cospi_arr(cos_bit); |
4781 | 0 | bf0 = output; |
4782 | 0 | bf1 = step; |
4783 | 0 | bf1[0] = bf0[0] + bf0[7]; |
4784 | 0 | bf1[1] = bf0[1] + bf0[6]; |
4785 | 0 | bf1[2] = bf0[2] + bf0[5]; |
4786 | 0 | bf1[3] = bf0[3] + bf0[4]; |
4787 | 0 | bf1[4] = -bf0[4] + bf0[3]; |
4788 | 0 | bf1[5] = -bf0[5] + bf0[2]; |
4789 | 0 | bf1[6] = -bf0[6] + bf0[1]; |
4790 | 0 | bf1[7] = -bf0[7] + bf0[0]; |
4791 | 0 | bf1[8] = bf0[8]; |
4792 | 0 | bf1[9] = bf0[9]; |
4793 | 0 | bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); |
4794 | 0 | bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); |
4795 | 0 | bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); |
4796 | 0 | bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); |
4797 | 0 | bf1[14] = bf0[14]; |
4798 | 0 | bf1[15] = bf0[15]; |
4799 | | |
4800 | | // stage 3 |
4801 | 0 | cospi = cospi_arr(cos_bit); |
4802 | 0 | bf0 = step; |
4803 | 0 | bf1 = output; |
4804 | 0 | bf1[0] = bf0[0] + bf0[3]; |
4805 | 0 | bf1[1] = bf0[1] + bf0[2]; |
4806 | 0 | bf1[2] = -bf0[2] + bf0[1]; |
4807 | 0 | bf1[3] = -bf0[3] + bf0[0]; |
4808 | 0 | bf1[4] = bf0[4]; |
4809 | 0 | bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); |
4810 | 0 | bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); |
4811 | 0 | bf1[7] = bf0[7]; |
4812 | 0 | bf1[8] = bf0[8] + bf0[11]; |
4813 | 0 | bf1[9] = bf0[9] + bf0[10]; |
4814 | 0 | bf1[10] = -bf0[10] + bf0[9]; |
4815 | 0 | bf1[11] = -bf0[11] + bf0[8]; |
4816 | 0 | bf1[12] = -bf0[12] + bf0[15]; |
4817 | 0 | bf1[13] = -bf0[13] + bf0[14]; |
4818 | 0 | bf1[14] = bf0[14] + bf0[13]; |
4819 | 0 | bf1[15] = bf0[15] + bf0[12]; |
4820 | | |
4821 | | // stage 4 |
// step[1] and step[3] are not computed from here on; no later stage reads them.
4822 | 0 | cospi = cospi_arr(cos_bit); |
4823 | 0 | bf0 = output; |
4824 | 0 | bf1 = step; |
4825 | 0 | bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); |
4826 | 0 | bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); |
4827 | 0 | bf1[4] = bf0[4] + bf0[5]; |
4828 | 0 | bf1[5] = -bf0[5] + bf0[4]; |
4829 | 0 | bf1[6] = -bf0[6] + bf0[7]; |
4830 | 0 | bf1[7] = bf0[7] + bf0[6]; |
4831 | 0 | bf1[8] = bf0[8]; |
4832 | 0 | bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); |
4833 | 0 | bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); |
4834 | 0 | bf1[11] = bf0[11]; |
4835 | 0 | bf1[12] = bf0[12]; |
4836 | 0 | bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); |
4837 | 0 | bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); |
4838 | 0 | bf1[15] = bf0[15]; |
4839 | | |
4840 | | // stage 5 |
4841 | 0 | cospi = cospi_arr(cos_bit); |
4842 | 0 | bf0 = step; |
4843 | 0 | bf1 = output; |
4844 | 0 | bf1[0] = bf0[0]; |
4845 | 0 | bf1[2] = bf0[2]; |
4846 | 0 | bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); |
4847 | 0 | bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); |
4848 | 0 | bf1[8] = bf0[8] + bf0[9]; |
4849 | 0 | bf1[9] = -bf0[9] + bf0[8]; |
4850 | 0 | bf1[10] = -bf0[10] + bf0[11]; |
4851 | 0 | bf1[11] = bf0[11] + bf0[10]; |
4852 | 0 | bf1[12] = bf0[12] + bf0[13]; |
4853 | 0 | bf1[13] = -bf0[13] + bf0[12]; |
4854 | 0 | bf1[14] = -bf0[14] + bf0[15]; |
4855 | 0 | bf1[15] = bf0[15] + bf0[14]; |
4856 | | |
4857 | | // stage 6 |
// Only the even-indexed step[] entries are produced; they are the ones stage 7 reads.
4858 | 0 | cospi = cospi_arr(cos_bit); |
4859 | 0 | bf0 = output; |
4860 | 0 | bf1 = step; |
4861 | 0 | bf1[0] = bf0[0]; |
4862 | 0 | bf1[2] = bf0[2]; |
4863 | 0 | bf1[4] = bf0[4]; |
4864 | 0 | bf1[6] = bf0[6]; |
4865 | 0 | bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); |
4866 | 0 | bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); |
4867 | 0 | bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); |
4868 | 0 | bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); |
4869 | | |
4870 | | // stage 7 |
// Final reordering into output[0..7]; output[8..15] are not rewritten.
4871 | 0 | bf0 = step; |
4872 | 0 | bf1 = output; |
4873 | 0 | bf1[0] = bf0[0]; |
4874 | 0 | bf1[1] = bf0[8]; |
4875 | 0 | bf1[2] = bf0[4]; |
4876 | 0 | bf1[3] = bf0[12]; |
4877 | 0 | bf1[4] = bf0[2]; |
4878 | 0 | bf1[5] = bf0[10]; |
4879 | 0 | bf1[6] = bf0[6]; |
4880 | 0 | bf1[7] = bf0[14]; |
4881 | 0 | } |
4882 | | |
/*
 * N2 variant of the 8-point identity stage: doubles the first four inputs
 * into output[0..3] and leaves output[4..7] untouched.
 *
 * cos_bit and stage_range are unused.
 */
void svt_av1_fidentity8_N2_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)cos_bit;
    (void)stage_range;

    int32_t k = 0;
    while (k < 4) {
        output[k] = input[k] * 2;
        ++k;
    }
}
4890 | | |
/*
 * N2 variant of the 8-point fadst butterfly network: only output[0..3]
 * receive final results (stage 7 writes just those four entries, and stage 6
 * computes only the step[] entries that stage 7 consumes).
 *
 * input:       8 values, read in stage 1 only.
 * output:      must not be the same pointer as input (asserted). It is used as
 *              a ping-pong buffer together with the local step[] array, so all
 *              8 entries are written along the way; on return output[4..7]
 *              still hold the stage-5 intermediates.
 * cos_bit:     passed to cospi_arr() to obtain the cospi[] table and forwarded
 *              to every half_btf() call.
 * stage_range: unused.
 */
4891 | 0 | void svt_av1_fadst8_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
4892 | 0 | (void)stage_range; |
4893 | 0 | const int32_t* cospi; |
4894 | 

4895 | 0 | int32_t *bf0, *bf1; |
4896 | 0 | int32_t step[8]; |
4897 | | |
4898 | | // stage 0; |
4899 | | |
4900 | | // stage 1; |
// Signed permutation of the 8 inputs into output[].
4901 | 0 | assert(output != input); |
4902 | 0 | bf1 = output; |
4903 | 0 | bf1[0] = input[0]; |
4904 | 0 | bf1[1] = -input[7]; |
4905 | 0 | bf1[2] = -input[3]; |
4906 | 0 | bf1[3] = input[4]; |
4907 | 0 | bf1[4] = -input[1]; |
4908 | 0 | bf1[5] = input[6]; |
4909 | 0 | bf1[6] = input[2]; |
4910 | 0 | bf1[7] = -input[5]; |
4911 | | |
4912 | | // stage 2 |
4913 | 0 | cospi = cospi_arr(cos_bit); |
4914 | 0 | bf0 = output; |
4915 | 0 | bf1 = step; |
4916 | 0 | bf1[0] = bf0[0]; |
4917 | 0 | bf1[1] = bf0[1]; |
4918 | 0 | bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); |
4919 | 0 | bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); |
4920 | 0 | bf1[4] = bf0[4]; |
4921 | 0 | bf1[5] = bf0[5]; |
4922 | 0 | bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); |
4923 | 0 | bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); |
4924 | | |
4925 | | // stage 3 |
4926 | 0 | bf0 = step; |
4927 | 0 | bf1 = output; |
4928 | 0 | bf1[0] = bf0[0] + bf0[2]; |
4929 | 0 | bf1[1] = bf0[1] + bf0[3]; |
4930 | 0 | bf1[2] = bf0[0] - bf0[2]; |
4931 | 0 | bf1[3] = bf0[1] - bf0[3]; |
4932 | 0 | bf1[4] = bf0[4] + bf0[6]; |
4933 | 0 | bf1[5] = bf0[5] + bf0[7]; |
4934 | 0 | bf1[6] = bf0[4] - bf0[6]; |
4935 | 0 | bf1[7] = bf0[5] - bf0[7]; |
4936 | | |
4937 | | // stage 4 |
4938 | 0 | cospi = cospi_arr(cos_bit); |
4939 | 0 | bf0 = output; |
4940 | 0 | bf1 = step; |
4941 | 0 | bf1[0] = bf0[0]; |
4942 | 0 | bf1[1] = bf0[1]; |
4943 | 0 | bf1[2] = bf0[2]; |
4944 | 0 | bf1[3] = bf0[3]; |
4945 | 0 | bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); |
4946 | 0 | bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); |
4947 | 0 | bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); |
4948 | 0 | bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); |
4949 | | |
4950 | | // stage 5 |
4951 | 0 | bf0 = step; |
4952 | 0 | bf1 = output; |
4953 | 0 | bf1[0] = bf0[0] + bf0[4]; |
4954 | 0 | bf1[1] = bf0[1] + bf0[5]; |
4955 | 0 | bf1[2] = bf0[2] + bf0[6]; |
4956 | 0 | bf1[3] = bf0[3] + bf0[7]; |
4957 | 0 | bf1[4] = bf0[0] - bf0[4]; |
4958 | 0 | bf1[5] = bf0[1] - bf0[5]; |
4959 | 0 | bf1[6] = bf0[2] - bf0[6]; |
4960 | 0 | bf1[7] = bf0[3] - bf0[7]; |
4961 | | |
4962 | | // stage 6 |
// Only the four step[] entries read by stage 7 are computed.
4963 | 0 | cospi = cospi_arr(cos_bit); |
4964 | 0 | bf0 = output; |
4965 | 0 | bf1 = step; |
4966 | 0 | bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); |
4967 | 0 | bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); |
4968 | 0 | bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); |
4969 | 0 | bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); |
4970 | | |
4971 | | // stage 7 |
// Final reordering into output[0..3]; output[4..7] are not rewritten.
4972 | 0 | bf0 = step; |
4973 | 0 | bf1 = output; |
4974 | 0 | bf1[0] = bf0[1]; |
4975 | 0 | bf1[1] = bf0[6]; |
4976 | 0 | bf1[2] = bf0[3]; |
4977 | 0 | bf1[3] = bf0[4]; |
4978 | 0 | } |
4979 | | |
/*
 * N2 variant of the 8-point fdct butterfly network: only output[0..3] receive
 * final results (stage 5 writes just those four entries, and stages 3 and 4
 * skip the entries that would only feed the dropped outputs).
 *
 * input:       8 values, read in stage 1 only.
 * output:      used as a ping-pong buffer together with the local step[]
 *              array, so all 8 entries are written along the way; on return
 *              output[4..7] still hold the stage-3 intermediates. Stage 1
 *              reads input[k] after output[k] has already been written, so
 *              passing output == input would consume overwritten values (this
 *              is not asserted here).
 * cos_bit:     passed to cospi_arr() to obtain the cospi[] table and forwarded
 *              to every half_btf() call.
 * stage_range: unused.
 */
4980 | 0 | void svt_av1_fdct8_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
4981 | 0 | (void)stage_range; |
4982 | 0 | const int32_t* cospi; |
4983 | 

4984 | 0 | int32_t *bf0, *bf1; |
4985 | 0 | int32_t step[8]; |
4986 | | |
4987 | | // stage 0; |
4988 | | |
4989 | | // stage 1; |
// Mirror sums in output[0..3], mirror differences in output[4..7].
4990 | 0 | bf1 = output; |
4991 | 0 | bf1[0] = input[0] + input[7]; |
4992 | 0 | bf1[1] = input[1] + input[6]; |
4993 | 0 | bf1[2] = input[2] + input[5]; |
4994 | 0 | bf1[3] = input[3] + input[4]; |
4995 | 0 | bf1[4] = -input[4] + input[3]; |
4996 | 0 | bf1[5] = -input[5] + input[2]; |
4997 | 0 | bf1[6] = -input[6] + input[1]; |
4998 | 0 | bf1[7] = -input[7] + input[0]; |
4999 | | |
5000 | | // stage 2 |
5001 | 0 | cospi = cospi_arr(cos_bit); |
5002 | 0 | bf0 = output; |
5003 | 0 | bf1 = step; |
5004 | 0 | bf1[0] = bf0[0] + bf0[3]; |
5005 | 0 | bf1[1] = bf0[1] + bf0[2]; |
5006 | 0 | bf1[2] = -bf0[2] + bf0[1]; |
5007 | 0 | bf1[3] = -bf0[3] + bf0[0]; |
5008 | 0 | bf1[4] = bf0[4]; |
5009 | 0 | bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); |
5010 | 0 | bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); |
5011 | 0 | bf1[7] = bf0[7]; |
5012 | | |
5013 | | // stage 3 |
// output[1] and output[3] are not recomputed here; stage 4 does not read them
// and stage 5 overwrites them.
5014 | 0 | cospi = cospi_arr(cos_bit); |
5015 | 0 | bf0 = step; |
5016 | 0 | bf1 = output; |
5017 | 0 | bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); |
5018 | 0 | bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); |
5019 | 0 | bf1[4] = bf0[4] + bf0[5]; |
5020 | 0 | bf1[5] = -bf0[5] + bf0[4]; |
5021 | 0 | bf1[6] = -bf0[6] + bf0[7]; |
5022 | 0 | bf1[7] = bf0[7] + bf0[6]; |
5023 | | |
5024 | | // stage 4 |
// Only the four step[] entries read by stage 5 are computed.
5025 | 0 | cospi = cospi_arr(cos_bit); |
5026 | 0 | bf0 = output; |
5027 | 0 | bf1 = step; |
5028 | 0 | bf1[0] = bf0[0]; |
5029 | 0 | bf1[2] = bf0[2]; |
5030 | 0 | bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); |
5031 | 0 | bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); |
5032 | | |
5033 | | // stage 5 |
// Final reordering into output[0..3]; output[4..7] are not rewritten.
5034 | 0 | bf0 = step; |
5035 | 0 | bf1 = output; |
5036 | 0 | bf1[0] = bf0[0]; |
5037 | 0 | bf1[1] = bf0[4]; |
5038 | 0 | bf1[2] = bf0[2]; |
5039 | 0 | bf1[3] = bf0[6]; |
5040 | 0 | } |
5041 | | |
5042 | 0 | void svt_av1_fidentity4_N2_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
5043 | 0 | (void)stage_range; |
5044 | 0 | (void)cos_bit; |
5045 | 0 | output[0] = round_shift((int64_t)input[0] * new_sqrt2, new_sqrt2_bits); |
5046 | 0 | output[1] = round_shift((int64_t)input[1] * new_sqrt2, new_sqrt2_bits); |
5047 | 0 | assert(stage_range[0] + new_sqrt2_bits <= 32); |
5048 | 0 | } |
5049 | | |
/*
 * N2 variant of the 4-point fadst: reads input[0..3] and produces only
 * output[0] and output[1].
 *
 * If all four inputs are zero, output[0..3] are cleared and the function
 * returns early. Otherwise output[2..3] are left untouched.
 * cos_bit selects the sine table via sinpi_arr() and is the shift amount
 * passed to round_shift(). stage_range is unused.
 */
void svt_av1_fadst4_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;

    const int32_t  bit     = cos_bit;
    const int32_t* sin_tab = sinpi_arr(bit);

    // Snapshot the inputs before anything is written.
    const int32_t in0 = input[0];
    const int32_t in1 = input[1];
    const int32_t in2 = input[2];
    const int32_t in3 = input[3];

    // All-zero shortcut: nothing to transform.
    if ((in0 | in1 | in2 | in3) == 0) {
        for (int32_t k = 0; k < 4; ++k) {
            output[k] = 0;
        }
        return;
    }

    // Sine-weighted terms.
    const int32_t w0 = sin_tab[1] * in0;
    const int32_t w1 = sin_tab[2] * in1;
    const int32_t w2 = sin_tab[3] * in2;
    const int32_t w3 = sin_tab[4] * in3;

    // Combined term feeding output[1].
    const int32_t mix = (in0 + in1) - in3;

    // Accumulate in the order ((w0 + w1) + w3) + w2.
    const int32_t acc    = ((w0 + w1) + w3) + w2;
    const int32_t scaled = sin_tab[3] * mix;

    // 1-D transform scaling factor is sqrt(2).
    output[0] = round_shift(acc, bit);
    output[1] = round_shift(scaled, bit);
}
5092 | | |
/*
 * N2 variant of the 4-point fdct: reads input[0..3] and produces only
 * output[0] and output[1]; output[2..3] are left untouched.
 *
 * All four inputs are consumed before the first write to output.
 * cos_bit selects the cosine table via cospi_arr() and is forwarded to
 * half_btf(). stage_range is unused.
 */
void svt_av1_fdct4_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;

    // stage 1: mirror sums and differences
    const int32_t sum_outer  = input[0] + input[3];
    const int32_t sum_inner  = input[1] + input[2];
    const int32_t diff_inner = -input[2] + input[1];
    const int32_t diff_outer = -input[3] + input[0];

    // stage 2: only the two rotations that feed the kept outputs
    const int32_t* cos_tab = cospi_arr(cos_bit);

    output[0] = half_btf(cos_tab[32], sum_outer, cos_tab[32], sum_inner, cos_bit);
    output[1] = half_btf(cos_tab[48], diff_inner, cos_tab[16], diff_outer, cos_bit);
}
5113 | | |
5114 | 0 | void svt_av1_fdct32_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
5115 | 0 | (void)stage_range; |
5116 | 0 | const int32_t* cospi; |
5117 | |
|
5118 | 0 | int32_t *bf0, *bf1; |
5119 | 0 | int32_t step[32]; |
5120 | | |
5121 | | // stage 0; |
5122 | | |
5123 | | // stage 1; |
5124 | 0 | bf1 = output; |
5125 | 0 | bf1[0] = input[0] + input[31]; |
5126 | 0 | bf1[1] = input[1] + input[30]; |
5127 | 0 | bf1[2] = input[2] + input[29]; |
5128 | 0 | bf1[3] = input[3] + input[28]; |
5129 | 0 | bf1[4] = input[4] + input[27]; |
5130 | 0 | bf1[5] = input[5] + input[26]; |
5131 | 0 | bf1[6] = input[6] + input[25]; |
5132 | 0 | bf1[7] = input[7] + input[24]; |
5133 | 0 | bf1[8] = input[8] + input[23]; |
5134 | 0 | bf1[9] = input[9] + input[22]; |
5135 | 0 | bf1[10] = input[10] + input[21]; |
5136 | 0 | bf1[11] = input[11] + input[20]; |
5137 | 0 | bf1[12] = input[12] + input[19]; |
5138 | 0 | bf1[13] = input[13] + input[18]; |
5139 | 0 | bf1[14] = input[14] + input[17]; |
5140 | 0 | bf1[15] = input[15] + input[16]; |
5141 | 0 | bf1[16] = -input[16] + input[15]; |
5142 | 0 | bf1[17] = -input[17] + input[14]; |
5143 | 0 | bf1[18] = -input[18] + input[13]; |
5144 | 0 | bf1[19] = -input[19] + input[12]; |
5145 | 0 | bf1[20] = -input[20] + input[11]; |
5146 | 0 | bf1[21] = -input[21] + input[10]; |
5147 | 0 | bf1[22] = -input[22] + input[9]; |
5148 | 0 | bf1[23] = -input[23] + input[8]; |
5149 | 0 | bf1[24] = -input[24] + input[7]; |
5150 | 0 | bf1[25] = -input[25] + input[6]; |
5151 | 0 | bf1[26] = -input[26] + input[5]; |
5152 | 0 | bf1[27] = -input[27] + input[4]; |
5153 | 0 | bf1[28] = -input[28] + input[3]; |
5154 | 0 | bf1[29] = -input[29] + input[2]; |
5155 | 0 | bf1[30] = -input[30] + input[1]; |
5156 | 0 | bf1[31] = -input[31] + input[0]; |
5157 | | |
5158 | | // stage 2 |
5159 | 0 | cospi = cospi_arr(cos_bit); |
5160 | 0 | bf0 = output; |
5161 | 0 | bf1 = step; |
5162 | 0 | bf1[0] = bf0[0] + bf0[15]; |
5163 | 0 | bf1[1] = bf0[1] + bf0[14]; |
5164 | 0 | bf1[2] = bf0[2] + bf0[13]; |
5165 | 0 | bf1[3] = bf0[3] + bf0[12]; |
5166 | 0 | bf1[4] = bf0[4] + bf0[11]; |
5167 | 0 | bf1[5] = bf0[5] + bf0[10]; |
5168 | 0 | bf1[6] = bf0[6] + bf0[9]; |
5169 | 0 | bf1[7] = bf0[7] + bf0[8]; |
5170 | 0 | bf1[8] = -bf0[8] + bf0[7]; |
5171 | 0 | bf1[9] = -bf0[9] + bf0[6]; |
5172 | 0 | bf1[10] = -bf0[10] + bf0[5]; |
5173 | 0 | bf1[11] = -bf0[11] + bf0[4]; |
5174 | 0 | bf1[12] = -bf0[12] + bf0[3]; |
5175 | 0 | bf1[13] = -bf0[13] + bf0[2]; |
5176 | 0 | bf1[14] = -bf0[14] + bf0[1]; |
5177 | 0 | bf1[15] = -bf0[15] + bf0[0]; |
5178 | 0 | bf1[16] = bf0[16]; |
5179 | 0 | bf1[17] = bf0[17]; |
5180 | 0 | bf1[18] = bf0[18]; |
5181 | 0 | bf1[19] = bf0[19]; |
5182 | 0 | bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); |
5183 | 0 | bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); |
5184 | 0 | bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); |
5185 | 0 | bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); |
5186 | 0 | bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); |
5187 | 0 | bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); |
5188 | 0 | bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); |
5189 | 0 | bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); |
5190 | 0 | bf1[28] = bf0[28]; |
5191 | 0 | bf1[29] = bf0[29]; |
5192 | 0 | bf1[30] = bf0[30]; |
5193 | 0 | bf1[31] = bf0[31]; |
5194 | | |
5195 | | // stage 3 |
5196 | 0 | cospi = cospi_arr(cos_bit); |
5197 | 0 | bf0 = step; |
5198 | 0 | bf1 = output; |
5199 | 0 | bf1[0] = bf0[0] + bf0[7]; |
5200 | 0 | bf1[1] = bf0[1] + bf0[6]; |
5201 | 0 | bf1[2] = bf0[2] + bf0[5]; |
5202 | 0 | bf1[3] = bf0[3] + bf0[4]; |
5203 | 0 | bf1[4] = -bf0[4] + bf0[3]; |
5204 | 0 | bf1[5] = -bf0[5] + bf0[2]; |
5205 | 0 | bf1[6] = -bf0[6] + bf0[1]; |
5206 | 0 | bf1[7] = -bf0[7] + bf0[0]; |
5207 | 0 | bf1[8] = bf0[8]; |
5208 | 0 | bf1[9] = bf0[9]; |
5209 | 0 | bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); |
5210 | 0 | bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); |
5211 | 0 | bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); |
5212 | 0 | bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); |
5213 | 0 | bf1[14] = bf0[14]; |
5214 | 0 | bf1[15] = bf0[15]; |
5215 | 0 | bf1[16] = bf0[16] + bf0[23]; |
5216 | 0 | bf1[17] = bf0[17] + bf0[22]; |
5217 | 0 | bf1[18] = bf0[18] + bf0[21]; |
5218 | 0 | bf1[19] = bf0[19] + bf0[20]; |
5219 | 0 | bf1[20] = -bf0[20] + bf0[19]; |
5220 | 0 | bf1[21] = -bf0[21] + bf0[18]; |
5221 | 0 | bf1[22] = -bf0[22] + bf0[17]; |
5222 | 0 | bf1[23] = -bf0[23] + bf0[16]; |
5223 | 0 | bf1[24] = -bf0[24] + bf0[31]; |
5224 | 0 | bf1[25] = -bf0[25] + bf0[30]; |
5225 | 0 | bf1[26] = -bf0[26] + bf0[29]; |
5226 | 0 | bf1[27] = -bf0[27] + bf0[28]; |
5227 | 0 | bf1[28] = bf0[28] + bf0[27]; |
5228 | 0 | bf1[29] = bf0[29] + bf0[26]; |
5229 | 0 | bf1[30] = bf0[30] + bf0[25]; |
5230 | 0 | bf1[31] = bf0[31] + bf0[24]; |
5231 | | |
5232 | | // stage 4 |
5233 | 0 | cospi = cospi_arr(cos_bit); |
5234 | 0 | bf0 = output; |
5235 | 0 | bf1 = step; |
5236 | 0 | bf1[0] = bf0[0] + bf0[3]; |
5237 | 0 | bf1[1] = bf0[1] + bf0[2]; |
5238 | 0 | bf1[2] = -bf0[2] + bf0[1]; |
5239 | 0 | bf1[3] = -bf0[3] + bf0[0]; |
5240 | 0 | bf1[4] = bf0[4]; |
5241 | 0 | bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); |
5242 | 0 | bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); |
5243 | 0 | bf1[7] = bf0[7]; |
5244 | 0 | bf1[8] = bf0[8] + bf0[11]; |
5245 | 0 | bf1[9] = bf0[9] + bf0[10]; |
5246 | 0 | bf1[10] = -bf0[10] + bf0[9]; |
5247 | 0 | bf1[11] = -bf0[11] + bf0[8]; |
5248 | 0 | bf1[12] = -bf0[12] + bf0[15]; |
5249 | 0 | bf1[13] = -bf0[13] + bf0[14]; |
5250 | 0 | bf1[14] = bf0[14] + bf0[13]; |
5251 | 0 | bf1[15] = bf0[15] + bf0[12]; |
5252 | 0 | bf1[16] = bf0[16]; |
5253 | 0 | bf1[17] = bf0[17]; |
5254 | 0 | bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); |
5255 | 0 | bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); |
5256 | 0 | bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); |
5257 | 0 | bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); |
5258 | 0 | bf1[22] = bf0[22]; |
5259 | 0 | bf1[23] = bf0[23]; |
5260 | 0 | bf1[24] = bf0[24]; |
5261 | 0 | bf1[25] = bf0[25]; |
5262 | 0 | bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); |
5263 | 0 | bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); |
5264 | 0 | bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); |
5265 | 0 | bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); |
5266 | 0 | bf1[30] = bf0[30]; |
5267 | 0 | bf1[31] = bf0[31]; |
5268 | | |
5269 | | // stage 5 |
5270 | 0 | cospi = cospi_arr(cos_bit); |
5271 | 0 | bf0 = step; |
5272 | 0 | bf1 = output; |
5273 | 0 | bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); |
5274 | 0 | bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); |
5275 | 0 | bf1[4] = bf0[4] + bf0[5]; |
5276 | 0 | bf1[5] = -bf0[5] + bf0[4]; |
5277 | 0 | bf1[6] = -bf0[6] + bf0[7]; |
5278 | 0 | bf1[7] = bf0[7] + bf0[6]; |
5279 | 0 | bf1[8] = bf0[8]; |
5280 | 0 | bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); |
5281 | 0 | bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); |
5282 | 0 | bf1[11] = bf0[11]; |
5283 | 0 | bf1[12] = bf0[12]; |
5284 | 0 | bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); |
5285 | 0 | bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); |
5286 | 0 | bf1[15] = bf0[15]; |
5287 | 0 | bf1[16] = bf0[16] + bf0[19]; |
5288 | 0 | bf1[17] = bf0[17] + bf0[18]; |
5289 | 0 | bf1[18] = -bf0[18] + bf0[17]; |
5290 | 0 | bf1[19] = -bf0[19] + bf0[16]; |
5291 | 0 | bf1[20] = -bf0[20] + bf0[23]; |
5292 | 0 | bf1[21] = -bf0[21] + bf0[22]; |
5293 | 0 | bf1[22] = bf0[22] + bf0[21]; |
5294 | 0 | bf1[23] = bf0[23] + bf0[20]; |
5295 | 0 | bf1[24] = bf0[24] + bf0[27]; |
5296 | 0 | bf1[25] = bf0[25] + bf0[26]; |
5297 | 0 | bf1[26] = -bf0[26] + bf0[25]; |
5298 | 0 | bf1[27] = -bf0[27] + bf0[24]; |
5299 | 0 | bf1[28] = -bf0[28] + bf0[31]; |
5300 | 0 | bf1[29] = -bf0[29] + bf0[30]; |
5301 | 0 | bf1[30] = bf0[30] + bf0[29]; |
5302 | 0 | bf1[31] = bf0[31] + bf0[28]; |
5303 | | |
5304 | | // stage 6 |
5305 | 0 | cospi = cospi_arr(cos_bit); |
5306 | 0 | bf0 = output; |
5307 | 0 | bf1 = step; |
5308 | 0 | bf1[0] = bf0[0]; |
5309 | 0 | bf1[2] = bf0[2]; |
5310 | 0 | bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); |
5311 | 0 | bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); |
5312 | 0 | bf1[8] = bf0[8] + bf0[9]; |
5313 | 0 | bf1[9] = -bf0[9] + bf0[8]; |
5314 | 0 | bf1[10] = -bf0[10] + bf0[11]; |
5315 | 0 | bf1[11] = bf0[11] + bf0[10]; |
5316 | 0 | bf1[12] = bf0[12] + bf0[13]; |
5317 | 0 | bf1[13] = -bf0[13] + bf0[12]; |
5318 | 0 | bf1[14] = -bf0[14] + bf0[15]; |
5319 | 0 | bf1[15] = bf0[15] + bf0[14]; |
5320 | 0 | bf1[16] = bf0[16]; |
5321 | 0 | bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); |
5322 | 0 | bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); |
5323 | 0 | bf1[19] = bf0[19]; |
5324 | 0 | bf1[20] = bf0[20]; |
5325 | 0 | bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); |
5326 | 0 | bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); |
5327 | 0 | bf1[23] = bf0[23]; |
5328 | 0 | bf1[24] = bf0[24]; |
5329 | 0 | bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); |
5330 | 0 | bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); |
5331 | 0 | bf1[27] = bf0[27]; |
5332 | 0 | bf1[28] = bf0[28]; |
5333 | 0 | bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); |
5334 | 0 | bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); |
5335 | 0 | bf1[31] = bf0[31]; |
5336 | | |
5337 | | // stage 7 |
5338 | 0 | cospi = cospi_arr(cos_bit); |
5339 | 0 | bf0 = step; |
5340 | 0 | bf1 = output; |
5341 | 0 | bf1[0] = bf0[0]; |
5342 | 0 | bf1[2] = bf0[2]; |
5343 | 0 | bf1[4] = bf0[4]; |
5344 | 0 | bf1[6] = bf0[6]; |
5345 | 0 | bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); |
5346 | 0 | bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); |
5347 | 0 | bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); |
5348 | 0 | bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); |
5349 | 0 | bf1[16] = bf0[16] + bf0[17]; |
5350 | 0 | bf1[17] = -bf0[17] + bf0[16]; |
5351 | 0 | bf1[18] = -bf0[18] + bf0[19]; |
5352 | 0 | bf1[19] = bf0[19] + bf0[18]; |
5353 | 0 | bf1[20] = bf0[20] + bf0[21]; |
5354 | 0 | bf1[21] = -bf0[21] + bf0[20]; |
5355 | 0 | bf1[22] = -bf0[22] + bf0[23]; |
5356 | 0 | bf1[23] = bf0[23] + bf0[22]; |
5357 | 0 | bf1[24] = bf0[24] + bf0[25]; |
5358 | 0 | bf1[25] = -bf0[25] + bf0[24]; |
5359 | 0 | bf1[26] = -bf0[26] + bf0[27]; |
5360 | 0 | bf1[27] = bf0[27] + bf0[26]; |
5361 | 0 | bf1[28] = bf0[28] + bf0[29]; |
5362 | 0 | bf1[29] = -bf0[29] + bf0[28]; |
5363 | 0 | bf1[30] = -bf0[30] + bf0[31]; |
5364 | 0 | bf1[31] = bf0[31] + bf0[30]; |
5365 | | |
5366 | | // stage 8 |
5367 | 0 | cospi = cospi_arr(cos_bit); |
5368 | 0 | bf0 = output; |
5369 | 0 | bf1 = step; |
5370 | 0 | bf1[0] = bf0[0]; |
5371 | 0 | bf1[2] = bf0[2]; |
5372 | 0 | bf1[4] = bf0[4]; |
5373 | 0 | bf1[6] = bf0[6]; |
5374 | 0 | bf1[8] = bf0[8]; |
5375 | 0 | bf1[10] = bf0[10]; |
5376 | 0 | bf1[12] = bf0[12]; |
5377 | 0 | bf1[14] = bf0[14]; |
5378 | 0 | bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); |
5379 | 0 | bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); |
5380 | 0 | bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); |
5381 | 0 | bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); |
5382 | 0 | bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); |
5383 | 0 | bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); |
5384 | 0 | bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); |
5385 | 0 | bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); |
5386 | | |
5387 | | // stage 9 |
5388 | 0 | bf0 = step; |
5389 | 0 | bf1 = output; |
5390 | 0 | bf1[0] = bf0[0]; |
5391 | 0 | bf1[1] = bf0[16]; |
5392 | 0 | bf1[2] = bf0[8]; |
5393 | 0 | bf1[3] = bf0[24]; |
5394 | 0 | bf1[4] = bf0[4]; |
5395 | 0 | bf1[5] = bf0[20]; |
5396 | 0 | bf1[6] = bf0[12]; |
5397 | 0 | bf1[7] = bf0[28]; |
5398 | 0 | bf1[8] = bf0[2]; |
5399 | 0 | bf1[9] = bf0[18]; |
5400 | 0 | bf1[10] = bf0[10]; |
5401 | 0 | bf1[11] = bf0[26]; |
5402 | 0 | bf1[12] = bf0[6]; |
5403 | 0 | bf1[13] = bf0[22]; |
5404 | 0 | bf1[14] = bf0[14]; |
5405 | 0 | bf1[15] = bf0[30]; |
5406 | 0 | } |
5407 | | |
// Scaled copy of the leading half of a 32-entry vector (presumably the N2,
// i.e. half-output, form of the 32-point identity transform -- confirm
// against the callers).
//
//   input       : source coefficients; only entries [0, 16) are read.
//   output      : destination; entries [0, 16) receive 4 * input[i]. Nothing
//                 at index 16 or above is written by this routine.
//   cos_bit     : accepted for signature parity with the other 1-D kernels;
//                 not used here.
//   stage_range : likewise not used here.
void svt_av1_fidentity32_N2_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    // Neither argument influences the result; the casts only silence
    // unused-parameter warnings.
    (void)cos_bit;
    (void)stage_range;

    enum { kept_coeffs = 16, gain = 4 };

    // Walk both buffers front to back, one element at a time, so the
    // read/write order is the same as a plain ascending index loop.
    const int32_t*       src  = input;
    const int32_t* const stop = input + kept_coeffs;
    int32_t*             dst  = output;
    while (src != stop) {
        *dst++ = *src++ * gain;
    }
}
5415 | | |
5416 | 0 | void svt_av1_fdct64_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
5417 | 0 | (void)stage_range; |
5418 | 0 | const int32_t* cospi; |
5419 | |
|
5420 | 0 | int32_t *bf0, *bf1; |
5421 | 0 | int32_t step[64]; |
5422 | | |
5423 | | // stage 0; |
5424 | | |
5425 | | // stage 1; |
5426 | 0 | bf1 = output; |
5427 | 0 | bf1[0] = input[0] + input[63]; |
5428 | 0 | bf1[1] = input[1] + input[62]; |
5429 | 0 | bf1[2] = input[2] + input[61]; |
5430 | 0 | bf1[3] = input[3] + input[60]; |
5431 | 0 | bf1[4] = input[4] + input[59]; |
5432 | 0 | bf1[5] = input[5] + input[58]; |
5433 | 0 | bf1[6] = input[6] + input[57]; |
5434 | 0 | bf1[7] = input[7] + input[56]; |
5435 | 0 | bf1[8] = input[8] + input[55]; |
5436 | 0 | bf1[9] = input[9] + input[54]; |
5437 | 0 | bf1[10] = input[10] + input[53]; |
5438 | 0 | bf1[11] = input[11] + input[52]; |
5439 | 0 | bf1[12] = input[12] + input[51]; |
5440 | 0 | bf1[13] = input[13] + input[50]; |
5441 | 0 | bf1[14] = input[14] + input[49]; |
5442 | 0 | bf1[15] = input[15] + input[48]; |
5443 | 0 | bf1[16] = input[16] + input[47]; |
5444 | 0 | bf1[17] = input[17] + input[46]; |
5445 | 0 | bf1[18] = input[18] + input[45]; |
5446 | 0 | bf1[19] = input[19] + input[44]; |
5447 | 0 | bf1[20] = input[20] + input[43]; |
5448 | 0 | bf1[21] = input[21] + input[42]; |
5449 | 0 | bf1[22] = input[22] + input[41]; |
5450 | 0 | bf1[23] = input[23] + input[40]; |
5451 | 0 | bf1[24] = input[24] + input[39]; |
5452 | 0 | bf1[25] = input[25] + input[38]; |
5453 | 0 | bf1[26] = input[26] + input[37]; |
5454 | 0 | bf1[27] = input[27] + input[36]; |
5455 | 0 | bf1[28] = input[28] + input[35]; |
5456 | 0 | bf1[29] = input[29] + input[34]; |
5457 | 0 | bf1[30] = input[30] + input[33]; |
5458 | 0 | bf1[31] = input[31] + input[32]; |
5459 | 0 | bf1[32] = -input[32] + input[31]; |
5460 | 0 | bf1[33] = -input[33] + input[30]; |
5461 | 0 | bf1[34] = -input[34] + input[29]; |
5462 | 0 | bf1[35] = -input[35] + input[28]; |
5463 | 0 | bf1[36] = -input[36] + input[27]; |
5464 | 0 | bf1[37] = -input[37] + input[26]; |
5465 | 0 | bf1[38] = -input[38] + input[25]; |
5466 | 0 | bf1[39] = -input[39] + input[24]; |
5467 | 0 | bf1[40] = -input[40] + input[23]; |
5468 | 0 | bf1[41] = -input[41] + input[22]; |
5469 | 0 | bf1[42] = -input[42] + input[21]; |
5470 | 0 | bf1[43] = -input[43] + input[20]; |
5471 | 0 | bf1[44] = -input[44] + input[19]; |
5472 | 0 | bf1[45] = -input[45] + input[18]; |
5473 | 0 | bf1[46] = -input[46] + input[17]; |
5474 | 0 | bf1[47] = -input[47] + input[16]; |
5475 | 0 | bf1[48] = -input[48] + input[15]; |
5476 | 0 | bf1[49] = -input[49] + input[14]; |
5477 | 0 | bf1[50] = -input[50] + input[13]; |
5478 | 0 | bf1[51] = -input[51] + input[12]; |
5479 | 0 | bf1[52] = -input[52] + input[11]; |
5480 | 0 | bf1[53] = -input[53] + input[10]; |
5481 | 0 | bf1[54] = -input[54] + input[9]; |
5482 | 0 | bf1[55] = -input[55] + input[8]; |
5483 | 0 | bf1[56] = -input[56] + input[7]; |
5484 | 0 | bf1[57] = -input[57] + input[6]; |
5485 | 0 | bf1[58] = -input[58] + input[5]; |
5486 | 0 | bf1[59] = -input[59] + input[4]; |
5487 | 0 | bf1[60] = -input[60] + input[3]; |
5488 | 0 | bf1[61] = -input[61] + input[2]; |
5489 | 0 | bf1[62] = -input[62] + input[1]; |
5490 | 0 | bf1[63] = -input[63] + input[0]; |
5491 | | |
5492 | | // stage 2 |
5493 | 0 | cospi = cospi_arr(cos_bit); |
5494 | 0 | bf0 = output; |
5495 | 0 | bf1 = step; |
5496 | 0 | bf1[0] = bf0[0] + bf0[31]; |
5497 | 0 | bf1[1] = bf0[1] + bf0[30]; |
5498 | 0 | bf1[2] = bf0[2] + bf0[29]; |
5499 | 0 | bf1[3] = bf0[3] + bf0[28]; |
5500 | 0 | bf1[4] = bf0[4] + bf0[27]; |
5501 | 0 | bf1[5] = bf0[5] + bf0[26]; |
5502 | 0 | bf1[6] = bf0[6] + bf0[25]; |
5503 | 0 | bf1[7] = bf0[7] + bf0[24]; |
5504 | 0 | bf1[8] = bf0[8] + bf0[23]; |
5505 | 0 | bf1[9] = bf0[9] + bf0[22]; |
5506 | 0 | bf1[10] = bf0[10] + bf0[21]; |
5507 | 0 | bf1[11] = bf0[11] + bf0[20]; |
5508 | 0 | bf1[12] = bf0[12] + bf0[19]; |
5509 | 0 | bf1[13] = bf0[13] + bf0[18]; |
5510 | 0 | bf1[14] = bf0[14] + bf0[17]; |
5511 | 0 | bf1[15] = bf0[15] + bf0[16]; |
5512 | 0 | bf1[16] = -bf0[16] + bf0[15]; |
5513 | 0 | bf1[17] = -bf0[17] + bf0[14]; |
5514 | 0 | bf1[18] = -bf0[18] + bf0[13]; |
5515 | 0 | bf1[19] = -bf0[19] + bf0[12]; |
5516 | 0 | bf1[20] = -bf0[20] + bf0[11]; |
5517 | 0 | bf1[21] = -bf0[21] + bf0[10]; |
5518 | 0 | bf1[22] = -bf0[22] + bf0[9]; |
5519 | 0 | bf1[23] = -bf0[23] + bf0[8]; |
5520 | 0 | bf1[24] = -bf0[24] + bf0[7]; |
5521 | 0 | bf1[25] = -bf0[25] + bf0[6]; |
5522 | 0 | bf1[26] = -bf0[26] + bf0[5]; |
5523 | 0 | bf1[27] = -bf0[27] + bf0[4]; |
5524 | 0 | bf1[28] = -bf0[28] + bf0[3]; |
5525 | 0 | bf1[29] = -bf0[29] + bf0[2]; |
5526 | 0 | bf1[30] = -bf0[30] + bf0[1]; |
5527 | 0 | bf1[31] = -bf0[31] + bf0[0]; |
5528 | 0 | bf1[32] = bf0[32]; |
5529 | 0 | bf1[33] = bf0[33]; |
5530 | 0 | bf1[34] = bf0[34]; |
5531 | 0 | bf1[35] = bf0[35]; |
5532 | 0 | bf1[36] = bf0[36]; |
5533 | 0 | bf1[37] = bf0[37]; |
5534 | 0 | bf1[38] = bf0[38]; |
5535 | 0 | bf1[39] = bf0[39]; |
5536 | 0 | bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); |
5537 | 0 | bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); |
5538 | 0 | bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); |
5539 | 0 | bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); |
5540 | 0 | bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); |
5541 | 0 | bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); |
5542 | 0 | bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); |
5543 | 0 | bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); |
5544 | 0 | bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit); |
5545 | 0 | bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit); |
5546 | 0 | bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit); |
5547 | 0 | bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit); |
5548 | 0 | bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit); |
5549 | 0 | bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit); |
5550 | 0 | bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit); |
5551 | 0 | bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit); |
5552 | 0 | bf1[56] = bf0[56]; |
5553 | 0 | bf1[57] = bf0[57]; |
5554 | 0 | bf1[58] = bf0[58]; |
5555 | 0 | bf1[59] = bf0[59]; |
5556 | 0 | bf1[60] = bf0[60]; |
5557 | 0 | bf1[61] = bf0[61]; |
5558 | 0 | bf1[62] = bf0[62]; |
5559 | 0 | bf1[63] = bf0[63]; |
5560 | | |
5561 | | // stage 3 |
5562 | 0 | cospi = cospi_arr(cos_bit); |
5563 | 0 | bf0 = step; |
5564 | 0 | bf1 = output; |
5565 | 0 | bf1[0] = bf0[0] + bf0[15]; |
5566 | 0 | bf1[1] = bf0[1] + bf0[14]; |
5567 | 0 | bf1[2] = bf0[2] + bf0[13]; |
5568 | 0 | bf1[3] = bf0[3] + bf0[12]; |
5569 | 0 | bf1[4] = bf0[4] + bf0[11]; |
5570 | 0 | bf1[5] = bf0[5] + bf0[10]; |
5571 | 0 | bf1[6] = bf0[6] + bf0[9]; |
5572 | 0 | bf1[7] = bf0[7] + bf0[8]; |
5573 | 0 | bf1[8] = -bf0[8] + bf0[7]; |
5574 | 0 | bf1[9] = -bf0[9] + bf0[6]; |
5575 | 0 | bf1[10] = -bf0[10] + bf0[5]; |
5576 | 0 | bf1[11] = -bf0[11] + bf0[4]; |
5577 | 0 | bf1[12] = -bf0[12] + bf0[3]; |
5578 | 0 | bf1[13] = -bf0[13] + bf0[2]; |
5579 | 0 | bf1[14] = -bf0[14] + bf0[1]; |
5580 | 0 | bf1[15] = -bf0[15] + bf0[0]; |
5581 | 0 | bf1[16] = bf0[16]; |
5582 | 0 | bf1[17] = bf0[17]; |
5583 | 0 | bf1[18] = bf0[18]; |
5584 | 0 | bf1[19] = bf0[19]; |
5585 | 0 | bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); |
5586 | 0 | bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); |
5587 | 0 | bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); |
5588 | 0 | bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); |
5589 | 0 | bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); |
5590 | 0 | bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); |
5591 | 0 | bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); |
5592 | 0 | bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); |
5593 | 0 | bf1[28] = bf0[28]; |
5594 | 0 | bf1[29] = bf0[29]; |
5595 | 0 | bf1[30] = bf0[30]; |
5596 | 0 | bf1[31] = bf0[31]; |
5597 | 0 | bf1[32] = bf0[32] + bf0[47]; |
5598 | 0 | bf1[33] = bf0[33] + bf0[46]; |
5599 | 0 | bf1[34] = bf0[34] + bf0[45]; |
5600 | 0 | bf1[35] = bf0[35] + bf0[44]; |
5601 | 0 | bf1[36] = bf0[36] + bf0[43]; |
5602 | 0 | bf1[37] = bf0[37] + bf0[42]; |
5603 | 0 | bf1[38] = bf0[38] + bf0[41]; |
5604 | 0 | bf1[39] = bf0[39] + bf0[40]; |
5605 | 0 | bf1[40] = -bf0[40] + bf0[39]; |
5606 | 0 | bf1[41] = -bf0[41] + bf0[38]; |
5607 | 0 | bf1[42] = -bf0[42] + bf0[37]; |
5608 | 0 | bf1[43] = -bf0[43] + bf0[36]; |
5609 | 0 | bf1[44] = -bf0[44] + bf0[35]; |
5610 | 0 | bf1[45] = -bf0[45] + bf0[34]; |
5611 | 0 | bf1[46] = -bf0[46] + bf0[33]; |
5612 | 0 | bf1[47] = -bf0[47] + bf0[32]; |
5613 | 0 | bf1[48] = -bf0[48] + bf0[63]; |
5614 | 0 | bf1[49] = -bf0[49] + bf0[62]; |
5615 | 0 | bf1[50] = -bf0[50] + bf0[61]; |
5616 | 0 | bf1[51] = -bf0[51] + bf0[60]; |
5617 | 0 | bf1[52] = -bf0[52] + bf0[59]; |
5618 | 0 | bf1[53] = -bf0[53] + bf0[58]; |
5619 | 0 | bf1[54] = -bf0[54] + bf0[57]; |
5620 | 0 | bf1[55] = -bf0[55] + bf0[56]; |
5621 | 0 | bf1[56] = bf0[56] + bf0[55]; |
5622 | 0 | bf1[57] = bf0[57] + bf0[54]; |
5623 | 0 | bf1[58] = bf0[58] + bf0[53]; |
5624 | 0 | bf1[59] = bf0[59] + bf0[52]; |
5625 | 0 | bf1[60] = bf0[60] + bf0[51]; |
5626 | 0 | bf1[61] = bf0[61] + bf0[50]; |
5627 | 0 | bf1[62] = bf0[62] + bf0[49]; |
5628 | 0 | bf1[63] = bf0[63] + bf0[48]; |
5629 | | |
5630 | | // stage 4 |
5631 | 0 | cospi = cospi_arr(cos_bit); |
5632 | 0 | bf0 = output; |
5633 | 0 | bf1 = step; |
5634 | 0 | bf1[0] = bf0[0] + bf0[7]; |
5635 | 0 | bf1[1] = bf0[1] + bf0[6]; |
5636 | 0 | bf1[2] = bf0[2] + bf0[5]; |
5637 | 0 | bf1[3] = bf0[3] + bf0[4]; |
5638 | 0 | bf1[4] = -bf0[4] + bf0[3]; |
5639 | 0 | bf1[5] = -bf0[5] + bf0[2]; |
5640 | 0 | bf1[6] = -bf0[6] + bf0[1]; |
5641 | 0 | bf1[7] = -bf0[7] + bf0[0]; |
5642 | 0 | bf1[8] = bf0[8]; |
5643 | 0 | bf1[9] = bf0[9]; |
5644 | 0 | bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); |
5645 | 0 | bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); |
5646 | 0 | bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); |
5647 | 0 | bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); |
5648 | 0 | bf1[14] = bf0[14]; |
5649 | 0 | bf1[15] = bf0[15]; |
5650 | 0 | bf1[16] = bf0[16] + bf0[23]; |
5651 | 0 | bf1[17] = bf0[17] + bf0[22]; |
5652 | 0 | bf1[18] = bf0[18] + bf0[21]; |
5653 | 0 | bf1[19] = bf0[19] + bf0[20]; |
5654 | 0 | bf1[20] = -bf0[20] + bf0[19]; |
5655 | 0 | bf1[21] = -bf0[21] + bf0[18]; |
5656 | 0 | bf1[22] = -bf0[22] + bf0[17]; |
5657 | 0 | bf1[23] = -bf0[23] + bf0[16]; |
5658 | 0 | bf1[24] = -bf0[24] + bf0[31]; |
5659 | 0 | bf1[25] = -bf0[25] + bf0[30]; |
5660 | 0 | bf1[26] = -bf0[26] + bf0[29]; |
5661 | 0 | bf1[27] = -bf0[27] + bf0[28]; |
5662 | 0 | bf1[28] = bf0[28] + bf0[27]; |
5663 | 0 | bf1[29] = bf0[29] + bf0[26]; |
5664 | 0 | bf1[30] = bf0[30] + bf0[25]; |
5665 | 0 | bf1[31] = bf0[31] + bf0[24]; |
5666 | 0 | bf1[32] = bf0[32]; |
5667 | 0 | bf1[33] = bf0[33]; |
5668 | 0 | bf1[34] = bf0[34]; |
5669 | 0 | bf1[35] = bf0[35]; |
5670 | 0 | bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); |
5671 | 0 | bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); |
5672 | 0 | bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); |
5673 | 0 | bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); |
5674 | 0 | bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); |
5675 | 0 | bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); |
5676 | 0 | bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); |
5677 | 0 | bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); |
5678 | 0 | bf1[44] = bf0[44]; |
5679 | 0 | bf1[45] = bf0[45]; |
5680 | 0 | bf1[46] = bf0[46]; |
5681 | 0 | bf1[47] = bf0[47]; |
5682 | 0 | bf1[48] = bf0[48]; |
5683 | 0 | bf1[49] = bf0[49]; |
5684 | 0 | bf1[50] = bf0[50]; |
5685 | 0 | bf1[51] = bf0[51]; |
5686 | 0 | bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit); |
5687 | 0 | bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit); |
5688 | 0 | bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit); |
5689 | 0 | bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit); |
5690 | 0 | bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit); |
5691 | 0 | bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit); |
5692 | 0 | bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit); |
5693 | 0 | bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit); |
5694 | 0 | bf1[60] = bf0[60]; |
5695 | 0 | bf1[61] = bf0[61]; |
5696 | 0 | bf1[62] = bf0[62]; |
5697 | 0 | bf1[63] = bf0[63]; |
5698 | | |
5699 | | // stage 5 |
5700 | 0 | cospi = cospi_arr(cos_bit); |
5701 | 0 | bf0 = step; |
5702 | 0 | bf1 = output; |
5703 | 0 | bf1[0] = bf0[0] + bf0[3]; |
5704 | 0 | bf1[1] = bf0[1] + bf0[2]; |
5705 | 0 | bf1[2] = -bf0[2] + bf0[1]; |
5706 | 0 | bf1[3] = -bf0[3] + bf0[0]; |
5707 | 0 | bf1[4] = bf0[4]; |
5708 | 0 | bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); |
5709 | 0 | bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); |
5710 | 0 | bf1[7] = bf0[7]; |
5711 | 0 | bf1[8] = bf0[8] + bf0[11]; |
5712 | 0 | bf1[9] = bf0[9] + bf0[10]; |
5713 | 0 | bf1[10] = -bf0[10] + bf0[9]; |
5714 | 0 | bf1[11] = -bf0[11] + bf0[8]; |
5715 | 0 | bf1[12] = -bf0[12] + bf0[15]; |
5716 | 0 | bf1[13] = -bf0[13] + bf0[14]; |
5717 | 0 | bf1[14] = bf0[14] + bf0[13]; |
5718 | 0 | bf1[15] = bf0[15] + bf0[12]; |
5719 | 0 | bf1[16] = bf0[16]; |
5720 | 0 | bf1[17] = bf0[17]; |
5721 | 0 | bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); |
5722 | 0 | bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); |
5723 | 0 | bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); |
5724 | 0 | bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); |
5725 | 0 | bf1[22] = bf0[22]; |
5726 | 0 | bf1[23] = bf0[23]; |
5727 | 0 | bf1[24] = bf0[24]; |
5728 | 0 | bf1[25] = bf0[25]; |
5729 | 0 | bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); |
5730 | 0 | bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); |
5731 | 0 | bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); |
5732 | 0 | bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); |
5733 | 0 | bf1[30] = bf0[30]; |
5734 | 0 | bf1[31] = bf0[31]; |
5735 | 0 | bf1[32] = bf0[32] + bf0[39]; |
5736 | 0 | bf1[33] = bf0[33] + bf0[38]; |
5737 | 0 | bf1[34] = bf0[34] + bf0[37]; |
5738 | 0 | bf1[35] = bf0[35] + bf0[36]; |
5739 | 0 | bf1[36] = -bf0[36] + bf0[35]; |
5740 | 0 | bf1[37] = -bf0[37] + bf0[34]; |
5741 | 0 | bf1[38] = -bf0[38] + bf0[33]; |
5742 | 0 | bf1[39] = -bf0[39] + bf0[32]; |
5743 | 0 | bf1[40] = -bf0[40] + bf0[47]; |
5744 | 0 | bf1[41] = -bf0[41] + bf0[46]; |
5745 | 0 | bf1[42] = -bf0[42] + bf0[45]; |
5746 | 0 | bf1[43] = -bf0[43] + bf0[44]; |
5747 | 0 | bf1[44] = bf0[44] + bf0[43]; |
5748 | 0 | bf1[45] = bf0[45] + bf0[42]; |
5749 | 0 | bf1[46] = bf0[46] + bf0[41]; |
5750 | 0 | bf1[47] = bf0[47] + bf0[40]; |
5751 | 0 | bf1[48] = bf0[48] + bf0[55]; |
5752 | 0 | bf1[49] = bf0[49] + bf0[54]; |
5753 | 0 | bf1[50] = bf0[50] + bf0[53]; |
5754 | 0 | bf1[51] = bf0[51] + bf0[52]; |
5755 | 0 | bf1[52] = -bf0[52] + bf0[51]; |
5756 | 0 | bf1[53] = -bf0[53] + bf0[50]; |
5757 | 0 | bf1[54] = -bf0[54] + bf0[49]; |
5758 | 0 | bf1[55] = -bf0[55] + bf0[48]; |
5759 | 0 | bf1[56] = -bf0[56] + bf0[63]; |
5760 | 0 | bf1[57] = -bf0[57] + bf0[62]; |
5761 | 0 | bf1[58] = -bf0[58] + bf0[61]; |
5762 | 0 | bf1[59] = -bf0[59] + bf0[60]; |
5763 | 0 | bf1[60] = bf0[60] + bf0[59]; |
5764 | 0 | bf1[61] = bf0[61] + bf0[58]; |
5765 | 0 | bf1[62] = bf0[62] + bf0[57]; |
5766 | 0 | bf1[63] = bf0[63] + bf0[56]; |
5767 | | |
5768 | | // stage 6 |
5769 | 0 | cospi = cospi_arr(cos_bit); |
5770 | 0 | bf0 = output; |
5771 | 0 | bf1 = step; |
5772 | 0 | bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); |
5773 | 0 | bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); |
5774 | 0 | bf1[4] = bf0[4] + bf0[5]; |
5775 | 0 | bf1[5] = -bf0[5] + bf0[4]; |
5776 | 0 | bf1[6] = -bf0[6] + bf0[7]; |
5777 | 0 | bf1[7] = bf0[7] + bf0[6]; |
5778 | 0 | bf1[8] = bf0[8]; |
5779 | 0 | bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); |
5780 | 0 | bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); |
5781 | 0 | bf1[11] = bf0[11]; |
5782 | 0 | bf1[12] = bf0[12]; |
5783 | 0 | bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); |
5784 | 0 | bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); |
5785 | 0 | bf1[15] = bf0[15]; |
5786 | 0 | bf1[16] = bf0[16] + bf0[19]; |
5787 | 0 | bf1[17] = bf0[17] + bf0[18]; |
5788 | 0 | bf1[18] = -bf0[18] + bf0[17]; |
5789 | 0 | bf1[19] = -bf0[19] + bf0[16]; |
5790 | 0 | bf1[20] = -bf0[20] + bf0[23]; |
5791 | 0 | bf1[21] = -bf0[21] + bf0[22]; |
5792 | 0 | bf1[22] = bf0[22] + bf0[21]; |
5793 | 0 | bf1[23] = bf0[23] + bf0[20]; |
5794 | 0 | bf1[24] = bf0[24] + bf0[27]; |
5795 | 0 | bf1[25] = bf0[25] + bf0[26]; |
5796 | 0 | bf1[26] = -bf0[26] + bf0[25]; |
5797 | 0 | bf1[27] = -bf0[27] + bf0[24]; |
5798 | 0 | bf1[28] = -bf0[28] + bf0[31]; |
5799 | 0 | bf1[29] = -bf0[29] + bf0[30]; |
5800 | 0 | bf1[30] = bf0[30] + bf0[29]; |
5801 | 0 | bf1[31] = bf0[31] + bf0[28]; |
5802 | 0 | bf1[32] = bf0[32]; |
5803 | 0 | bf1[33] = bf0[33]; |
5804 | 0 | bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); |
5805 | 0 | bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); |
5806 | 0 | bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); |
5807 | 0 | bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); |
5808 | 0 | bf1[38] = bf0[38]; |
5809 | 0 | bf1[39] = bf0[39]; |
5810 | 0 | bf1[40] = bf0[40]; |
5811 | 0 | bf1[41] = bf0[41]; |
5812 | 0 | bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); |
5813 | 0 | bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); |
5814 | 0 | bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); |
5815 | 0 | bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); |
5816 | 0 | bf1[46] = bf0[46]; |
5817 | 0 | bf1[47] = bf0[47]; |
5818 | 0 | bf1[48] = bf0[48]; |
5819 | 0 | bf1[49] = bf0[49]; |
5820 | 0 | bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit); |
5821 | 0 | bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit); |
5822 | 0 | bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit); |
5823 | 0 | bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit); |
5824 | 0 | bf1[54] = bf0[54]; |
5825 | 0 | bf1[55] = bf0[55]; |
5826 | 0 | bf1[56] = bf0[56]; |
5827 | 0 | bf1[57] = bf0[57]; |
5828 | 0 | bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit); |
5829 | 0 | bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit); |
5830 | 0 | bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit); |
5831 | 0 | bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); |
5832 | 0 | bf1[62] = bf0[62]; |
5833 | 0 | bf1[63] = bf0[63]; |
5834 | | |
5835 | | // stage 7 |
5836 | 0 | cospi = cospi_arr(cos_bit); |
5837 | 0 | bf0 = step; |
5838 | 0 | bf1 = output; |
5839 | 0 | bf1[0] = bf0[0]; |
5840 | 0 | bf1[2] = bf0[2]; |
5841 | 0 | bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); |
5842 | 0 | bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); |
5843 | 0 | bf1[8] = bf0[8] + bf0[9]; |
5844 | 0 | bf1[9] = -bf0[9] + bf0[8]; |
5845 | 0 | bf1[10] = -bf0[10] + bf0[11]; |
5846 | 0 | bf1[11] = bf0[11] + bf0[10]; |
5847 | 0 | bf1[12] = bf0[12] + bf0[13]; |
5848 | 0 | bf1[13] = -bf0[13] + bf0[12]; |
5849 | 0 | bf1[14] = -bf0[14] + bf0[15]; |
5850 | 0 | bf1[15] = bf0[15] + bf0[14]; |
5851 | 0 | bf1[16] = bf0[16]; |
5852 | 0 | bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); |
5853 | 0 | bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); |
5854 | 0 | bf1[19] = bf0[19]; |
5855 | 0 | bf1[20] = bf0[20]; |
5856 | 0 | bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); |
5857 | 0 | bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); |
5858 | 0 | bf1[23] = bf0[23]; |
5859 | 0 | bf1[24] = bf0[24]; |
5860 | 0 | bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); |
5861 | 0 | bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); |
5862 | 0 | bf1[27] = bf0[27]; |
5863 | 0 | bf1[28] = bf0[28]; |
5864 | 0 | bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); |
5865 | 0 | bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); |
5866 | 0 | bf1[31] = bf0[31]; |
5867 | 0 | bf1[32] = bf0[32] + bf0[35]; |
5868 | 0 | bf1[33] = bf0[33] + bf0[34]; |
5869 | 0 | bf1[34] = -bf0[34] + bf0[33]; |
5870 | 0 | bf1[35] = -bf0[35] + bf0[32]; |
5871 | 0 | bf1[36] = -bf0[36] + bf0[39]; |
5872 | 0 | bf1[37] = -bf0[37] + bf0[38]; |
5873 | 0 | bf1[38] = bf0[38] + bf0[37]; |
5874 | 0 | bf1[39] = bf0[39] + bf0[36]; |
5875 | 0 | bf1[40] = bf0[40] + bf0[43]; |
5876 | 0 | bf1[41] = bf0[41] + bf0[42]; |
5877 | 0 | bf1[42] = -bf0[42] + bf0[41]; |
5878 | 0 | bf1[43] = -bf0[43] + bf0[40]; |
5879 | 0 | bf1[44] = -bf0[44] + bf0[47]; |
5880 | 0 | bf1[45] = -bf0[45] + bf0[46]; |
5881 | 0 | bf1[46] = bf0[46] + bf0[45]; |
5882 | 0 | bf1[47] = bf0[47] + bf0[44]; |
5883 | 0 | bf1[48] = bf0[48] + bf0[51]; |
5884 | 0 | bf1[49] = bf0[49] + bf0[50]; |
5885 | 0 | bf1[50] = -bf0[50] + bf0[49]; |
5886 | 0 | bf1[51] = -bf0[51] + bf0[48]; |
5887 | 0 | bf1[52] = -bf0[52] + bf0[55]; |
5888 | 0 | bf1[53] = -bf0[53] + bf0[54]; |
5889 | 0 | bf1[54] = bf0[54] + bf0[53]; |
5890 | 0 | bf1[55] = bf0[55] + bf0[52]; |
5891 | 0 | bf1[56] = bf0[56] + bf0[59]; |
5892 | 0 | bf1[57] = bf0[57] + bf0[58]; |
5893 | 0 | bf1[58] = -bf0[58] + bf0[57]; |
5894 | 0 | bf1[59] = -bf0[59] + bf0[56]; |
5895 | 0 | bf1[60] = -bf0[60] + bf0[63]; |
5896 | 0 | bf1[61] = -bf0[61] + bf0[62]; |
5897 | 0 | bf1[62] = bf0[62] + bf0[61]; |
5898 | 0 | bf1[63] = bf0[63] + bf0[60]; |
5899 | | |
5900 | | // stage 8 |
5901 | 0 | cospi = cospi_arr(cos_bit); |
5902 | 0 | bf0 = output; |
5903 | 0 | bf1 = step; |
5904 | 0 | bf1[0] = bf0[0]; |
5905 | 0 | bf1[2] = bf0[2]; |
5906 | 0 | bf1[4] = bf0[4]; |
5907 | 0 | bf1[6] = bf0[6]; |
5908 | 0 | bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); |
5909 | 0 | bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); |
5910 | 0 | bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); |
5911 | 0 | bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); |
5912 | 0 | bf1[16] = bf0[16] + bf0[17]; |
5913 | 0 | bf1[17] = -bf0[17] + bf0[16]; |
5914 | 0 | bf1[18] = -bf0[18] + bf0[19]; |
5915 | 0 | bf1[19] = bf0[19] + bf0[18]; |
5916 | 0 | bf1[20] = bf0[20] + bf0[21]; |
5917 | 0 | bf1[21] = -bf0[21] + bf0[20]; |
5918 | 0 | bf1[22] = -bf0[22] + bf0[23]; |
5919 | 0 | bf1[23] = bf0[23] + bf0[22]; |
5920 | 0 | bf1[24] = bf0[24] + bf0[25]; |
5921 | 0 | bf1[25] = -bf0[25] + bf0[24]; |
5922 | 0 | bf1[26] = -bf0[26] + bf0[27]; |
5923 | 0 | bf1[27] = bf0[27] + bf0[26]; |
5924 | 0 | bf1[28] = bf0[28] + bf0[29]; |
5925 | 0 | bf1[29] = -bf0[29] + bf0[28]; |
5926 | 0 | bf1[30] = -bf0[30] + bf0[31]; |
5927 | 0 | bf1[31] = bf0[31] + bf0[30]; |
5928 | 0 | bf1[32] = bf0[32]; |
5929 | 0 | bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); |
5930 | 0 | bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); |
5931 | 0 | bf1[35] = bf0[35]; |
5932 | 0 | bf1[36] = bf0[36]; |
5933 | 0 | bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); |
5934 | 0 | bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); |
5935 | 0 | bf1[39] = bf0[39]; |
5936 | 0 | bf1[40] = bf0[40]; |
5937 | 0 | bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); |
5938 | 0 | bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); |
5939 | 0 | bf1[43] = bf0[43]; |
5940 | 0 | bf1[44] = bf0[44]; |
5941 | 0 | bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); |
5942 | 0 | bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); |
5943 | 0 | bf1[47] = bf0[47]; |
5944 | 0 | bf1[48] = bf0[48]; |
5945 | 0 | bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit); |
5946 | 0 | bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit); |
5947 | 0 | bf1[51] = bf0[51]; |
5948 | 0 | bf1[52] = bf0[52]; |
5949 | 0 | bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit); |
5950 | 0 | bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit); |
5951 | 0 | bf1[55] = bf0[55]; |
5952 | 0 | bf1[56] = bf0[56]; |
5953 | 0 | bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit); |
5954 | 0 | bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit); |
5955 | 0 | bf1[59] = bf0[59]; |
5956 | 0 | bf1[60] = bf0[60]; |
5957 | 0 | bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); |
5958 | 0 | bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); |
5959 | 0 | bf1[63] = bf0[63]; |
5960 | | |
5961 | | // stage 9 |
5962 | 0 | cospi = cospi_arr(cos_bit); |
5963 | 0 | bf0 = step; |
5964 | 0 | bf1 = output; |
5965 | 0 | bf1[0] = bf0[0]; |
5966 | 0 | bf1[2] = bf0[2]; |
5967 | 0 | bf1[4] = bf0[4]; |
5968 | 0 | bf1[6] = bf0[6]; |
5969 | 0 | bf1[8] = bf0[8]; |
5970 | 0 | bf1[10] = bf0[10]; |
5971 | 0 | bf1[12] = bf0[12]; |
5972 | 0 | bf1[14] = bf0[14]; |
5973 | 0 | bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); |
5974 | 0 | bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); |
5975 | 0 | bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); |
5976 | 0 | bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); |
5977 | 0 | bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); |
5978 | 0 | bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); |
5979 | 0 | bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); |
5980 | 0 | bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); |
5981 | 0 | bf1[32] = bf0[32] + bf0[33]; |
5982 | 0 | bf1[33] = -bf0[33] + bf0[32]; |
5983 | 0 | bf1[34] = -bf0[34] + bf0[35]; |
5984 | 0 | bf1[35] = bf0[35] + bf0[34]; |
5985 | 0 | bf1[36] = bf0[36] + bf0[37]; |
5986 | 0 | bf1[37] = -bf0[37] + bf0[36]; |
5987 | 0 | bf1[38] = -bf0[38] + bf0[39]; |
5988 | 0 | bf1[39] = bf0[39] + bf0[38]; |
5989 | 0 | bf1[40] = bf0[40] + bf0[41]; |
5990 | 0 | bf1[41] = -bf0[41] + bf0[40]; |
5991 | 0 | bf1[42] = -bf0[42] + bf0[43]; |
5992 | 0 | bf1[43] = bf0[43] + bf0[42]; |
5993 | 0 | bf1[44] = bf0[44] + bf0[45]; |
5994 | 0 | bf1[45] = -bf0[45] + bf0[44]; |
5995 | 0 | bf1[46] = -bf0[46] + bf0[47]; |
5996 | 0 | bf1[47] = bf0[47] + bf0[46]; |
5997 | 0 | bf1[48] = bf0[48] + bf0[49]; |
5998 | 0 | bf1[49] = -bf0[49] + bf0[48]; |
5999 | 0 | bf1[50] = -bf0[50] + bf0[51]; |
6000 | 0 | bf1[51] = bf0[51] + bf0[50]; |
6001 | 0 | bf1[52] = bf0[52] + bf0[53]; |
6002 | 0 | bf1[53] = -bf0[53] + bf0[52]; |
6003 | 0 | bf1[54] = -bf0[54] + bf0[55]; |
6004 | 0 | bf1[55] = bf0[55] + bf0[54]; |
6005 | 0 | bf1[56] = bf0[56] + bf0[57]; |
6006 | 0 | bf1[57] = -bf0[57] + bf0[56]; |
6007 | 0 | bf1[58] = -bf0[58] + bf0[59]; |
6008 | 0 | bf1[59] = bf0[59] + bf0[58]; |
6009 | 0 | bf1[60] = bf0[60] + bf0[61]; |
6010 | 0 | bf1[61] = -bf0[61] + bf0[60]; |
6011 | 0 | bf1[62] = -bf0[62] + bf0[63]; |
6012 | 0 | bf1[63] = bf0[63] + bf0[62]; |
6013 | | |
6014 | | // stage 10 |
6015 | 0 | cospi = cospi_arr(cos_bit); |
6016 | 0 | bf0 = output; |
6017 | 0 | bf1 = step; |
6018 | 0 | bf1[0] = bf0[0]; |
6019 | 0 | bf1[2] = bf0[2]; |
6020 | 0 | bf1[4] = bf0[4]; |
6021 | 0 | bf1[6] = bf0[6]; |
6022 | 0 | bf1[8] = bf0[8]; |
6023 | 0 | bf1[10] = bf0[10]; |
6024 | 0 | bf1[12] = bf0[12]; |
6025 | 0 | bf1[14] = bf0[14]; |
6026 | 0 | bf1[16] = bf0[16]; |
6027 | 0 | bf1[18] = bf0[18]; |
6028 | 0 | bf1[20] = bf0[20]; |
6029 | 0 | bf1[22] = bf0[22]; |
6030 | 0 | bf1[24] = bf0[24]; |
6031 | 0 | bf1[26] = bf0[26]; |
6032 | 0 | bf1[28] = bf0[28]; |
6033 | 0 | bf1[30] = bf0[30]; |
6034 | 0 | bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit); |
6035 | 0 | bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit); |
6036 | 0 | bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit); |
6037 | 0 | bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit); |
6038 | 0 | bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit); |
6039 | 0 | bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit); |
6040 | 0 | bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit); |
6041 | 0 | bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit); |
6042 | 0 | bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit); |
6043 | 0 | bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit); |
6044 | 0 | bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit); |
6045 | 0 | bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit); |
6046 | 0 | bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit); |
6047 | 0 | bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit); |
6048 | 0 | bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit); |
6049 | 0 | bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit); |
6050 | | |
6051 | | // stage 11 |
6052 | 0 | bf0 = step; |
6053 | 0 | bf1 = output; |
6054 | 0 | bf1[0] = bf0[0]; |
6055 | 0 | bf1[1] = bf0[32]; |
6056 | 0 | bf1[2] = bf0[16]; |
6057 | 0 | bf1[3] = bf0[48]; |
6058 | 0 | bf1[4] = bf0[8]; |
6059 | 0 | bf1[5] = bf0[40]; |
6060 | 0 | bf1[6] = bf0[24]; |
6061 | 0 | bf1[7] = bf0[56]; |
6062 | 0 | bf1[8] = bf0[4]; |
6063 | 0 | bf1[9] = bf0[36]; |
6064 | 0 | bf1[10] = bf0[20]; |
6065 | 0 | bf1[11] = bf0[52]; |
6066 | 0 | bf1[12] = bf0[12]; |
6067 | 0 | bf1[13] = bf0[44]; |
6068 | 0 | bf1[14] = bf0[28]; |
6069 | 0 | bf1[15] = bf0[60]; |
6070 | 0 | bf1[16] = bf0[2]; |
6071 | 0 | bf1[17] = bf0[34]; |
6072 | 0 | bf1[18] = bf0[18]; |
6073 | 0 | bf1[19] = bf0[50]; |
6074 | 0 | bf1[20] = bf0[10]; |
6075 | 0 | bf1[21] = bf0[42]; |
6076 | 0 | bf1[22] = bf0[26]; |
6077 | 0 | bf1[23] = bf0[58]; |
6078 | 0 | bf1[24] = bf0[6]; |
6079 | 0 | bf1[25] = bf0[38]; |
6080 | 0 | bf1[26] = bf0[22]; |
6081 | 0 | bf1[27] = bf0[54]; |
6082 | 0 | bf1[28] = bf0[14]; |
6083 | 0 | bf1[29] = bf0[46]; |
6084 | 0 | bf1[30] = bf0[30]; |
6085 | 0 | bf1[31] = bf0[62]; |
6086 | 0 | } |
6087 | | |
6088 | 0 | static void av1_fidentity64_N2_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
6089 | 0 | (void)stage_range; |
6090 | 0 | (void)cos_bit; |
6091 | 0 | for (int32_t i = 0; i < 32; ++i) { |
6092 | 0 | output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits); |
6093 | 0 | } |
6094 | 0 | assert(stage_range[0] + new_sqrt2_bits <= 32); |
6095 | 0 | } |
6096 | | |
6097 | 0 | static INLINE TxfmFunc fwd_txfm_type_to_func_N2(TxfmType txfmtype) { |
6098 | 0 | switch (txfmtype) { |
6099 | 0 | case TXFM_TYPE_DCT4: |
6100 | 0 | return svt_av1_fdct4_new_N2; |
6101 | 0 | case TXFM_TYPE_DCT8: |
6102 | 0 | return svt_av1_fdct8_new_N2; |
6103 | 0 | case TXFM_TYPE_DCT16: |
6104 | 0 | return svt_av1_fdct16_new_N2; |
6105 | 0 | case TXFM_TYPE_DCT32: |
6106 | 0 | return svt_av1_fdct32_new_N2; |
6107 | 0 | case TXFM_TYPE_DCT64: |
6108 | 0 | return svt_av1_fdct64_new_N2; |
6109 | 0 | case TXFM_TYPE_ADST4: |
6110 | 0 | return svt_av1_fadst4_new_N2; |
6111 | 0 | case TXFM_TYPE_ADST8: |
6112 | 0 | return svt_av1_fadst8_new_N2; |
6113 | 0 | case TXFM_TYPE_ADST16: |
6114 | 0 | return svt_av1_fadst16_new_N2; |
6115 | 0 | case TXFM_TYPE_ADST32: |
6116 | 0 | return av1_fadst32_new; |
6117 | 0 | case TXFM_TYPE_IDENTITY4: |
6118 | 0 | return svt_av1_fidentity4_N2_c; |
6119 | 0 | case TXFM_TYPE_IDENTITY8: |
6120 | 0 | return svt_av1_fidentity8_N2_c; |
6121 | 0 | case TXFM_TYPE_IDENTITY16: |
6122 | 0 | return svt_av1_fidentity16_N2_c; |
6123 | 0 | case TXFM_TYPE_IDENTITY32: |
6124 | 0 | return svt_av1_fidentity32_N2_c; |
6125 | 0 | case TXFM_TYPE_IDENTITY64: |
6126 | 0 | return av1_fidentity64_N2_c; |
6127 | 0 | default: |
6128 | 0 | assert(0); |
6129 | 0 | return NULL; |
6130 | 0 | } |
6131 | 0 | } |
6132 | | |
6133 | | static INLINE void av1_tranform_two_d_core_N2_c(int16_t* input, uint32_t input_stride, int32_t* output, |
6134 | 0 | const Txfm2dFlipCfg* cfg, int32_t* buf, uint8_t bit_depth) { |
6135 | 0 | int32_t c, r; |
6136 | | // Note when assigning txfm_size_col, we use the txfm_size from the |
6137 | | // row configuration and vice versa. This is intentionally done to |
6138 | | // accurately perform rectangular transforms. When the transform is |
6139 | | // rectangular, the number of columns will be the same as the |
6140 | | // txfm_size stored in the row cfg struct. It will make no difference |
6141 | | // for square transforms. |
6142 | 0 | const int32_t txfm_size_col = tx_size_wide[cfg->tx_size]; |
6143 | 0 | const int32_t txfm_size_row = tx_size_high[cfg->tx_size]; |
6144 | | // Take the shift from the larger dimension in the rectangular case. |
6145 | 0 | const int8_t* shift = cfg->shift; |
6146 | 0 | const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); |
6147 | 0 | int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; |
6148 | 0 | int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; |
6149 | 0 | assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); |
6150 | 0 | assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); |
6151 | 0 | svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth); |
6152 | |
|
6153 | 0 | const int8_t cos_bit_col = cfg->cos_bit_col; |
6154 | 0 | const int8_t cos_bit_row = cfg->cos_bit_row; |
6155 | 0 | const TxfmFunc txfm_func_col = fwd_txfm_type_to_func_N2(cfg->txfm_type_col); |
6156 | 0 | const TxfmFunc txfm_func_row = fwd_txfm_type_to_func_N2(cfg->txfm_type_row); |
6157 | 0 | ASSERT(txfm_func_col != NULL); |
6158 | 0 | ASSERT(txfm_func_row != NULL); |
6159 | | // use output buffer as temp buffer |
6160 | 0 | int32_t* temp_in = output; |
6161 | 0 | int32_t* temp_out = output + txfm_size_row; |
6162 | | |
6163 | | // Columns |
6164 | 0 | for (c = 0; c < txfm_size_col; ++c) { |
6165 | 0 | if (cfg->ud_flip == 0) { |
6166 | 0 | for (r = 0; r < txfm_size_row; ++r) { |
6167 | 0 | temp_in[r] = input[r * input_stride + c]; |
6168 | 0 | } |
6169 | 0 | } else { |
6170 | 0 | for (r = 0; r < txfm_size_row; ++r) { |
6171 | | // flip upside down |
6172 | 0 | temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c]; |
6173 | 0 | } |
6174 | 0 | } |
6175 | 0 | svt_av1_round_shift_array_c(temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c |
6176 | 0 | txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); |
6177 | 0 | svt_av1_round_shift_array_c(temp_out, txfm_size_row / 2, -shift[1]); // NM svt_av1_round_shift_array_c |
6178 | 0 | if (cfg->lr_flip == 0) { |
6179 | 0 | for (r = 0; r < txfm_size_row; ++r) { |
6180 | 0 | buf[r * txfm_size_col + c] = temp_out[r]; |
6181 | 0 | } |
6182 | 0 | } else { |
6183 | 0 | for (r = 0; r < txfm_size_row; ++r) { |
6184 | | // flip from left to right |
6185 | 0 | buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r]; |
6186 | 0 | } |
6187 | 0 | } |
6188 | 0 | } |
6189 | | |
6190 | | // Rows |
6191 | 0 | for (r = 0; r < txfm_size_row / 2; ++r) { |
6192 | 0 | txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row); |
6193 | 0 | svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col / 2, -shift[2]); |
6194 | |
|
6195 | 0 | if (abs(rect_type) == 1) { |
6196 | | // Multiply everything by Sqrt2 if the transform is rectangular and the |
6197 | | // size difference is a factor of 2. |
6198 | 0 | for (c = 0; c < txfm_size_col / 2; ++c) { |
6199 | 0 | output[r * txfm_size_col + c] = round_shift((int64_t)output[r * txfm_size_col + c] * new_sqrt2, |
6200 | 0 | new_sqrt2_bits); |
6201 | 0 | } |
6202 | 0 | } |
6203 | 0 | } |
6204 | |
|
6205 | 0 | for (int i = 0; i < (txfm_size_col * txfm_size_row); i++) { |
6206 | 0 | if (i % txfm_size_col >= (txfm_size_col >> 1) || i / txfm_size_col >= (txfm_size_row >> 1)) { |
6207 | 0 | output[i] = 0; |
6208 | 0 | } |
6209 | 0 | } |
6210 | 0 | } |
6211 | | |
6212 | | void svt_aom_transform_two_d_64x64_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6213 | 0 | uint8_t bit_depth) { |
6214 | 0 | int32_t intermediate_transform_buffer[64 * 64]; |
6215 | 0 | Txfm2dFlipCfg cfg; |
6216 | 0 | svt_aom_transform_config(transform_type, TX_64X64, &cfg); |
6217 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6218 | 0 | } |
6219 | | |
6220 | | void svt_aom_transform_two_d_32x32_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6221 | 0 | uint8_t bit_depth) { |
6222 | 0 | int32_t intermediate_transform_buffer[32 * 32]; |
6223 | 0 | Txfm2dFlipCfg cfg; |
6224 | 0 | svt_aom_transform_config(transform_type, TX_32X32, &cfg); |
6225 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6226 | 0 | } |
6227 | | |
6228 | | void svt_aom_transform_two_d_16x16_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6229 | 0 | uint8_t bit_depth) { |
6230 | 0 | int32_t intermediate_transform_buffer[16 * 16]; |
6231 | 0 | Txfm2dFlipCfg cfg; |
6232 | 0 | svt_aom_transform_config(transform_type, TX_16X16, &cfg); |
6233 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6234 | 0 | } |
6235 | | |
6236 | | void svt_aom_transform_two_d_8x8_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6237 | 0 | uint8_t bit_depth) { |
6238 | 0 | int32_t intermediate_transform_buffer[8 * 8]; |
6239 | 0 | Txfm2dFlipCfg cfg; |
6240 | 0 | svt_aom_transform_config(transform_type, TX_8X8, &cfg); |
6241 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6242 | 0 | } |
6243 | | |
6244 | | void svt_aom_transform_two_d_4x4_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6245 | 0 | uint8_t bit_depth) { |
6246 | 0 | int32_t intermediate_transform_buffer[4 * 4]; |
6247 | 0 | Txfm2dFlipCfg cfg; |
6248 | 0 | svt_aom_transform_config(transform_type, TX_4X4, &cfg); |
6249 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6250 | 0 | } |
6251 | | |
6252 | | void svt_av1_fwd_txfm2d_64x32_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6253 | 0 | uint8_t bit_depth) { |
6254 | 0 | int32_t intermediate_transform_buffer[64 * 32]; |
6255 | 0 | Txfm2dFlipCfg cfg; |
6256 | 0 | svt_aom_transform_config(transform_type, TX_64X32, &cfg); |
6257 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6258 | 0 | } |
6259 | | |
6260 | | void svt_av1_fwd_txfm2d_32x64_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6261 | 0 | uint8_t bit_depth) { |
6262 | 0 | int32_t intermediate_transform_buffer[32 * 64]; |
6263 | 0 | Txfm2dFlipCfg cfg; |
6264 | 0 | svt_aom_transform_config(transform_type, TX_32X64, &cfg); |
6265 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6266 | 0 | } |
6267 | | |
6268 | | void svt_av1_fwd_txfm2d_64x16_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6269 | 0 | uint8_t bit_depth) { |
6270 | 0 | int32_t intermediate_transform_buffer[64 * 16]; |
6271 | 0 | Txfm2dFlipCfg cfg; |
6272 | 0 | svt_aom_transform_config(transform_type, TX_64X16, &cfg); |
6273 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6274 | 0 | } |
6275 | | |
6276 | | void svt_av1_fwd_txfm2d_16x64_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6277 | 0 | uint8_t bit_depth) { |
6278 | 0 | int32_t intermediate_transform_buffer[16 * 64]; |
6279 | 0 | Txfm2dFlipCfg cfg; |
6280 | 0 | svt_aom_transform_config(transform_type, TX_16X64, &cfg); |
6281 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6282 | 0 | } |
6283 | | |
6284 | | void svt_av1_fwd_txfm2d_32x16_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6285 | 0 | uint8_t bit_depth) { |
6286 | 0 | int32_t intermediate_transform_buffer[32 * 16]; |
6287 | 0 | Txfm2dFlipCfg cfg; |
6288 | 0 | svt_aom_transform_config(transform_type, TX_32X16, &cfg); |
6289 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6290 | 0 | } |
6291 | | |
6292 | | void svt_av1_fwd_txfm2d_16x32_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6293 | 0 | uint8_t bit_depth) { |
6294 | 0 | int32_t intermediate_transform_buffer[16 * 32]; |
6295 | 0 | Txfm2dFlipCfg cfg; |
6296 | 0 | svt_aom_transform_config(transform_type, TX_16X32, &cfg); |
6297 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6298 | 0 | } |
6299 | | |
6300 | | void svt_av1_fwd_txfm2d_16x8_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6301 | 0 | uint8_t bit_depth) { |
6302 | 0 | int32_t intermediate_transform_buffer[16 * 8]; |
6303 | 0 | Txfm2dFlipCfg cfg; |
6304 | 0 | svt_aom_transform_config(transform_type, TX_16X8, &cfg); |
6305 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6306 | 0 | } |
6307 | | |
6308 | | void svt_av1_fwd_txfm2d_8x16_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6309 | 0 | uint8_t bit_depth) { |
6310 | 0 | int32_t intermediate_transform_buffer[8 * 16]; |
6311 | 0 | Txfm2dFlipCfg cfg; |
6312 | 0 | svt_aom_transform_config(transform_type, TX_8X16, &cfg); |
6313 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6314 | 0 | } |
6315 | | |
6316 | | void svt_av1_fwd_txfm2d_32x8_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6317 | 0 | uint8_t bit_depth) { |
6318 | 0 | int32_t intermediate_transform_buffer[32 * 8]; |
6319 | 0 | Txfm2dFlipCfg cfg; |
6320 | 0 | svt_aom_transform_config(transform_type, TX_32X8, &cfg); |
6321 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6322 | 0 | } |
6323 | | |
6324 | | void svt_av1_fwd_txfm2d_8x32_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6325 | 0 | uint8_t bit_depth) { |
6326 | 0 | int32_t intermediate_transform_buffer[8 * 32]; |
6327 | 0 | Txfm2dFlipCfg cfg; |
6328 | 0 | svt_aom_transform_config(transform_type, TX_8X32, &cfg); |
6329 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6330 | 0 | } |
6331 | | |
6332 | | void svt_av1_fwd_txfm2d_16x4_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6333 | 0 | uint8_t bit_depth) { |
6334 | 0 | int32_t intermediate_transform_buffer[16 * 4]; |
6335 | 0 | Txfm2dFlipCfg cfg; |
6336 | 0 | svt_aom_transform_config(transform_type, TX_16X4, &cfg); |
6337 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6338 | 0 | } |
6339 | | |
6340 | | void svt_av1_fwd_txfm2d_4x16_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6341 | 0 | uint8_t bit_depth) { |
6342 | 0 | int32_t intermediate_transform_buffer[4 * 16]; |
6343 | 0 | Txfm2dFlipCfg cfg; |
6344 | 0 | svt_aom_transform_config(transform_type, TX_4X16, &cfg); |
6345 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6346 | 0 | } |
6347 | | |
6348 | | void svt_av1_fwd_txfm2d_8x4_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6349 | 0 | uint8_t bit_depth) { |
6350 | 0 | int32_t intermediate_transform_buffer[8 * 4]; |
6351 | 0 | Txfm2dFlipCfg cfg; |
6352 | 0 | svt_aom_transform_config(transform_type, TX_8X4, &cfg); |
6353 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6354 | 0 | } |
6355 | | |
6356 | | void svt_av1_fwd_txfm2d_4x8_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
6357 | 0 | uint8_t bit_depth) { |
6358 | 0 | int32_t intermediate_transform_buffer[4 * 8]; |
6359 | 0 | Txfm2dFlipCfg cfg; |
6360 | 0 | svt_aom_transform_config(transform_type, TX_4X8, &cfg); |
6361 | 0 | av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
6362 | 0 | } |
6363 | | |
/*
 * 4-point forward DCT, N4 variant: writes only output[0].
 * The value is half_btf applied with weight cospi[32] to the two mirrored
 * input sums (input[0] + input[3]) and (input[1] + input[2]).
 * stage_range is unused.
 */
void svt_av1_fdct4_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    const int32_t* cospi = cospi_arr(cos_bit);

    // Stage 1: sums of the outer and inner sample pairs.
    const int32_t outer_sum = input[0] + input[3];
    const int32_t inner_sum = input[1] + input[2];

    output[0] = half_btf(cospi[32], outer_sum, cospi[32], inner_sum, cos_bit);
}
6375 | | |
/*
 * 4-point forward ADST, N4 variant.
 * For an all-zero input, output[0..3] are cleared. Otherwise only output[0]
 * is written: the sinpi-weighted sum of the four inputs, rounded down by
 * cos_bit. stage_range is unused.
 */
void svt_av1_fadst4_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    const int32_t  shift = cos_bit;
    const int32_t* sinpi = sinpi_arr(shift);

    const int32_t in0 = input[0];
    const int32_t in1 = input[1];
    const int32_t in2 = input[2];
    const int32_t in3 = input[3];

    // Nothing to transform: zero all four output entries and leave.
    if ((in0 | in1 | in2 | in3) == 0) {
        for (int32_t k = 3; k >= 0; --k) {
            output[k] = 0;
        }
        return;
    }

    const int32_t term0 = sinpi[1] * in0;
    const int32_t term1 = sinpi[2] * in1;
    const int32_t term2 = sinpi[3] * in2;
    const int32_t term3 = sinpi[4] * in3;

    // Accumulate left to right as ((term0 + term1) + term3) + term2.
    const int32_t acc = term0 + term1 + term3 + term2;

    // 1-D transform scaling factor is sqrt(2).
    output[0] = round_shift(acc, shift);
}
6412 | | |
6413 | 0 | void svt_av1_fidentity4_N4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
6414 | 0 | (void)stage_range; |
6415 | 0 | (void)cos_bit; |
6416 | 0 | output[0] = round_shift((int64_t)input[0] * new_sqrt2, new_sqrt2_bits); |
6417 | 0 | assert(stage_range[0] + new_sqrt2_bits <= 32); |
6418 | 0 | } |
6419 | | |
/*
 * 8-point forward DCT, N4 variant.
 * The last stage writes only output[0] and output[1]. All eight entries of
 * `output` are used as working storage, so output[2..7] are left holding
 * intermediate values. stage_range is unused.
 */
void svt_av1_fdct8_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t tmp[8];

    // Stage 1: mirrored sums go to 0..3, mirrored differences to 4..7.
    for (int32_t i = 0; i < 4; ++i) {
        output[i] = input[i] + input[7 - i];
    }
    for (int32_t i = 4; i < 8; ++i) {
        output[i] = -input[i] + input[7 - i];
    }

    const int32_t* cospi = cospi_arr(cos_bit);

    // Stage 2 (tmp[2] and tmp[3] are never read by the later stages).
    tmp[0] = output[0] + output[3];
    tmp[1] = output[1] + output[2];
    tmp[4] = output[4];
    tmp[5] = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    tmp[6] = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    tmp[7] = output[7];

    // Stage 3
    output[0] = half_btf(cospi[32], tmp[0], cospi[32], tmp[1], cos_bit);
    output[4] = tmp[4] + tmp[5];
    output[7] = tmp[7] + tmp[6];

    // Stage 4
    tmp[0] = output[0];
    tmp[4] = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);

    // Stage 5: move the two retained results to the front.
    output[0] = tmp[0];
    output[1] = tmp[4];
}
6470 | | |
/*
 * 8-point forward ADST, N4 variant.
 * The last stage writes only output[0] and output[1]. All eight entries of
 * `output` are used as working storage, so output[2..7] are left holding
 * intermediate values. `output` must not be the same pointer as `input`
 * (asserted). stage_range is unused.
 */
void svt_av1_fadst8_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t tmp[8];

    // Stage 1: signed permutation of the input.
    assert(output != input);
    output[0] = input[0];
    output[1] = -input[7];
    output[2] = -input[3];
    output[3] = input[4];
    output[4] = -input[1];
    output[5] = input[6];
    output[6] = input[2];
    output[7] = -input[5];

    const int32_t* cospi = cospi_arr(cos_bit);

    // Stage 2: in each group of four, pass the first pair through and rotate
    // the second pair by cospi[32].
    for (int32_t g = 0; g < 8; g += 4) {
        tmp[g]     = output[g];
        tmp[g + 1] = output[g + 1];
        tmp[g + 2] = half_btf(cospi[32], output[g + 2], cospi[32], output[g + 3], cos_bit);
        tmp[g + 3] = half_btf(cospi[32], output[g + 2], -cospi[32], output[g + 3], cos_bit);
    }

    // Stage 3: sum/difference butterflies inside each group of four.
    for (int32_t g = 0; g < 8; g += 4) {
        output[g]     = tmp[g] + tmp[g + 2];
        output[g + 1] = tmp[g + 1] + tmp[g + 3];
        output[g + 2] = tmp[g] - tmp[g + 2];
        output[g + 3] = tmp[g + 1] - tmp[g + 3];
    }

    // Stage 4
    tmp[0] = output[0];
    tmp[1] = output[1];
    tmp[2] = output[2];
    tmp[3] = output[3];
    tmp[4] = half_btf(cospi[16], output[4], cospi[48], output[5], cos_bit);
    tmp[5] = half_btf(cospi[48], output[4], -cospi[16], output[5], cos_bit);
    tmp[6] = half_btf(-cospi[48], output[6], cospi[16], output[7], cos_bit);
    tmp[7] = half_btf(cospi[16], output[6], cospi[48], output[7], cos_bit);

    // Stage 5 (only the entries needed by stage 6 are formed)
    output[0] = tmp[0] + tmp[4];
    output[1] = tmp[1] + tmp[5];
    output[6] = tmp[2] - tmp[6];
    output[7] = tmp[3] - tmp[7];

    // Stage 6
    tmp[1] = half_btf(cospi[60], output[0], -cospi[4], output[1], cos_bit);
    tmp[6] = half_btf(cospi[52], output[6], cospi[12], output[7], cos_bit);

    // Stage 7: move the two retained results to the front.
    output[0] = tmp[1];
    output[1] = tmp[6];
}
6549 | | |
/*
 * 8-point identity kernel, N4 variant: writes only output[0] and output[1],
 * each being the matching input doubled. cos_bit and stage_range are unused.
 */
void svt_av1_fidentity8_N4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)cos_bit;
    (void)stage_range;

    output[0] = input[0] * 2;
    output[1] = input[1] * 2;
}
6557 | | |
/*
 * 16-point forward DCT, N4 variant.
 * The last stage writes only output[0..3]. All sixteen entries of `output`
 * are used as working storage, so output[4..15] are left holding
 * intermediate values. stage_range is unused.
 */
void svt_av1_fdct16_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t tmp[16];

    // Stage 1: mirrored sums go to 0..7, mirrored differences to 8..15.
    for (int32_t i = 0; i < 8; ++i) {
        output[i] = input[i] + input[15 - i];
    }
    for (int32_t i = 8; i < 16; ++i) {
        output[i] = -input[i] + input[15 - i];
    }

    const int32_t* cospi = cospi_arr(cos_bit);

    // Stage 2: fold the lower half again; rotate the middle of the upper half.
    for (int32_t i = 0; i < 4; ++i) {
        tmp[i] = output[i] + output[7 - i];
    }
    for (int32_t i = 4; i < 8; ++i) {
        tmp[i] = -output[i] + output[7 - i];
    }
    tmp[8]  = output[8];
    tmp[9]  = output[9];
    tmp[10] = half_btf(-cospi[32], output[10], cospi[32], output[13], cos_bit);
    tmp[11] = half_btf(-cospi[32], output[11], cospi[32], output[12], cos_bit);
    tmp[12] = half_btf(cospi[32], output[12], cospi[32], output[11], cos_bit);
    tmp[13] = half_btf(cospi[32], output[13], cospi[32], output[10], cos_bit);
    tmp[14] = output[14];
    tmp[15] = output[15];

    // Stage 3 (output[2] and output[3] keep their stage-1 values)
    output[0]  = tmp[0] + tmp[3];
    output[1]  = tmp[1] + tmp[2];
    output[4]  = tmp[4];
    output[5]  = half_btf(-cospi[32], tmp[5], cospi[32], tmp[6], cos_bit);
    output[6]  = half_btf(cospi[32], tmp[6], cospi[32], tmp[5], cos_bit);
    output[7]  = tmp[7];
    output[8]  = tmp[8] + tmp[11];
    output[9]  = tmp[9] + tmp[10];
    output[10] = -tmp[10] + tmp[9];
    output[11] = -tmp[11] + tmp[8];
    output[12] = -tmp[12] + tmp[15];
    output[13] = -tmp[13] + tmp[14];
    output[14] = tmp[14] + tmp[13];
    output[15] = tmp[15] + tmp[12];

    // Stage 4
    tmp[0]  = half_btf(cospi[32], output[0], cospi[32], output[1], cos_bit);
    tmp[4]  = output[4] + output[5];
    tmp[7]  = output[7] + output[6];
    tmp[8]  = output[8];
    tmp[9]  = half_btf(-cospi[16], output[9], cospi[48], output[14], cos_bit);
    tmp[10] = half_btf(-cospi[48], output[10], -cospi[16], output[13], cos_bit);
    tmp[11] = output[11];
    tmp[12] = output[12];
    tmp[13] = half_btf(cospi[48], output[13], -cospi[16], output[10], cos_bit);
    tmp[14] = half_btf(cospi[16], output[14], cospi[48], output[9], cos_bit);
    tmp[15] = output[15];

    // Stage 5
    output[0]  = tmp[0];
    output[4]  = half_btf(cospi[56], tmp[4], cospi[8], tmp[7], cos_bit);
    output[8]  = tmp[8] + tmp[9];
    output[11] = tmp[11] + tmp[10];
    output[12] = tmp[12] + tmp[13];
    output[15] = tmp[15] + tmp[14];

    // Stage 6
    tmp[0]  = output[0];
    tmp[4]  = output[4];
    tmp[8]  = half_btf(cospi[60], output[8], cospi[4], output[15], cos_bit);
    tmp[12] = half_btf(cospi[12], output[12], -cospi[52], output[11], cos_bit);

    // Stage 7: gather the four retained results at the front.
    output[0] = tmp[0];
    output[1] = tmp[8];
    output[2] = tmp[4];
    output[3] = tmp[12];
}
6666 | | |
/*
 * 16-point forward ADST butterfly network, reduced "N4" form: only output[0..3]
 * are given final (stage 9) values.
 *
 * input       : 16 readable coefficients; must not alias output (asserted).
 * output      : 16 writable entries. It doubles as a scratch buffer, so on return
 *               output[4..11] still hold stage-5 values and output[12..15] hold
 *               stage-7 values.
 * cos_bit     : selects the cosine table via cospi_arr() and is forwarded to
 *               every half_btf() call.
 * stage_range : unused.
 */
void svt_av1_fadst16_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    // Stage-1 gather: output[k] takes input[src_of[k]], negated where negate[k] is set.
    static const uint8_t src_of[16] = {0, 15, 7, 8, 3, 12, 4, 11, 1, 14, 6, 9, 2, 13, 5, 10};
    static const uint8_t negate[16] = {0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0};
    int32_t              step[16];
    int32_t              k, j;

    (void)stage_range;

    // stage 1: signed permutation of the input
    assert(output != input);
    for (k = 0; k < 16; ++k) {
        const int32_t v = input[src_of[k]];
        output[k]       = negate[k] ? -v : v;
    }

    const int32_t* const cospi = cospi_arr(cos_bit);

    // stage 2: in every group of four, pass the first pair and rotate the second pair
    for (k = 0; k < 16; k += 4) {
        step[k]     = output[k];
        step[k + 1] = output[k + 1];
        step[k + 2] = half_btf(cospi[32], output[k + 2], cospi[32], output[k + 3], cos_bit);
        step[k + 3] = half_btf(cospi[32], output[k + 2], -cospi[32], output[k + 3], cos_bit);
    }

    // stage 3: sum/difference at distance 2 inside every group of four
    for (k = 0; k < 16; k += 4) {
        output[k]     = step[k] + step[k + 2];
        output[k + 1] = step[k + 1] + step[k + 3];
        output[k + 2] = step[k] - step[k + 2];
        output[k + 3] = step[k + 1] - step[k + 3];
    }

    // stage 4: in every group of eight, pass the low half and rotate the high half
    for (k = 0; k < 16; k += 8) {
        for (j = 0; j < 4; ++j) {
            step[k + j] = output[k + j];
        }
        step[k + 4] = half_btf(cospi[16], output[k + 4], cospi[48], output[k + 5], cos_bit);
        step[k + 5] = half_btf(cospi[48], output[k + 4], -cospi[16], output[k + 5], cos_bit);
        step[k + 6] = half_btf(-cospi[48], output[k + 6], cospi[16], output[k + 7], cos_bit);
        step[k + 7] = half_btf(cospi[16], output[k + 6], cospi[48], output[k + 7], cos_bit);
    }

    // stage 5: sum/difference at distance 4 inside every group of eight
    for (k = 0; k < 16; k += 8) {
        for (j = 0; j < 4; ++j) {
            output[k + j]     = step[k + j] + step[k + j + 4];
            output[k + j + 4] = step[k + j] - step[k + j + 4];
        }
    }

    // stage 6: pass the low eight, rotate the high eight pairwise
    for (k = 0; k < 8; ++k) {
        step[k] = output[k];
    }
    step[8]  = half_btf(cospi[8], output[8], cospi[56], output[9], cos_bit);
    step[9]  = half_btf(cospi[56], output[8], -cospi[8], output[9], cos_bit);
    step[10] = half_btf(cospi[40], output[10], cospi[24], output[11], cos_bit);
    step[11] = half_btf(cospi[24], output[10], -cospi[40], output[11], cos_bit);
    step[12] = half_btf(-cospi[56], output[12], cospi[8], output[13], cos_bit);
    step[13] = half_btf(cospi[8], output[12], cospi[56], output[13], cos_bit);
    step[14] = half_btf(-cospi[24], output[14], cospi[40], output[15], cos_bit);
    step[15] = half_btf(cospi[40], output[14], cospi[24], output[15], cos_bit);

    // stage 7: only the sums for 0..3 and the differences for 12..15 are needed downstream
    for (k = 0; k < 4; ++k) {
        output[k]      = step[k] + step[k + 8];
        output[k + 12] = step[k + 4] - step[k + 12];
    }

    // stage 8: the four rotations that feed the retained outputs
    step[1]  = half_btf(cospi[62], output[0], -cospi[2], output[1], cos_bit);
    step[3]  = half_btf(cospi[54], output[2], -cospi[10], output[3], cos_bit);
    step[12] = half_btf(cospi[50], output[12], cospi[14], output[13], cos_bit);
    step[14] = half_btf(cospi[58], output[14], cospi[6], output[15], cos_bit);

    // stage 9: final ordering of the four retained outputs
    output[0] = step[1];
    output[1] = step[14];
    output[2] = step[3];
    output[3] = step[12];
}
6825 | | |
6826 | 0 | void svt_av1_fidentity16_N4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
6827 | 0 | (void)stage_range; |
6828 | 0 | (void)cos_bit; |
6829 | 0 | for (int32_t i = 0; i < 4; ++i) { |
6830 | 0 | output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits); |
6831 | 0 | } |
6832 | 0 | assert(stage_range[0] + new_sqrt2_bits <= 32); |
6833 | 0 | } |
6834 | | |
/*
 * 32-point forward DCT butterfly network, reduced "N4" form: only output[0..7]
 * are given final (stage 9) values.
 *
 * input       : 32 readable coefficients.
 * output      : 32 writable entries. It doubles as a scratch buffer, so on return
 *               output[8..31] still hold intermediate values written by stages 5
 *               and 7.
 * cos_bit     : selects the cosine table via cospi_arr() and is forwarded to
 *               every half_btf() call.
 * stage_range : unused.
 *
 * From stage 4 onward only the lanes that feed the eight retained outputs are
 * computed; the skipped lanes of `step`/`output` keep older values that are never
 * read again.
 */
void svt_av1_fdct32_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    // Stage-9 gather: output[i] takes step[final_src[i]].
    static const uint8_t final_src[8] = {0, 16, 8, 24, 4, 20, 12, 28};
    int32_t              step[32];
    int32_t              i, b;

    (void)stage_range;

    // stage 1: mirror sums into the low half, mirror differences into the high half
    for (i = 0; i < 16; ++i) {
        output[i] = input[i] + input[31 - i];
    }
    for (i = 16; i < 32; ++i) {
        output[i] = -input[i] + input[31 - i];
    }

    const int32_t* const cospi = cospi_arr(cos_bit);

    // stage 2
    for (i = 0; i < 8; ++i) {
        step[i] = output[i] + output[15 - i];
    }
    for (i = 8; i < 16; ++i) {
        step[i] = -output[i] + output[15 - i];
    }
    for (i = 16; i < 20; ++i) {
        step[i] = output[i];
    }
    for (i = 20; i < 24; ++i) {
        step[i] = half_btf(-cospi[32], output[i], cospi[32], output[47 - i], cos_bit);
    }
    for (i = 24; i < 28; ++i) {
        step[i] = half_btf(cospi[32], output[i], cospi[32], output[47 - i], cos_bit);
    }
    for (i = 28; i < 32; ++i) {
        step[i] = output[i];
    }

    // stage 3
    for (i = 0; i < 4; ++i) {
        output[i] = step[i] + step[7 - i];
    }
    for (i = 4; i < 8; ++i) {
        output[i] = -step[i] + step[7 - i];
    }
    output[8]  = step[8];
    output[9]  = step[9];
    output[10] = half_btf(-cospi[32], step[10], cospi[32], step[13], cos_bit);
    output[11] = half_btf(-cospi[32], step[11], cospi[32], step[12], cos_bit);
    output[12] = half_btf(cospi[32], step[12], cospi[32], step[11], cos_bit);
    output[13] = half_btf(cospi[32], step[13], cospi[32], step[10], cos_bit);
    output[14] = step[14];
    output[15] = step[15];
    for (i = 16; i < 20; ++i) {
        output[i] = step[i] + step[39 - i];
    }
    for (i = 20; i < 24; ++i) {
        output[i] = -step[i] + step[39 - i];
    }
    for (i = 24; i < 28; ++i) {
        output[i] = -step[i] + step[55 - i];
    }
    for (i = 28; i < 32; ++i) {
        output[i] = step[i] + step[55 - i];
    }

    // stage 4 (lanes 2 and 3 are not needed for the retained outputs)
    step[0]  = output[0] + output[3];
    step[1]  = output[1] + output[2];
    step[4]  = output[4];
    step[5]  = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    step[6]  = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    step[7]  = output[7];
    step[8]  = output[8] + output[11];
    step[9]  = output[9] + output[10];
    step[10] = -output[10] + output[9];
    step[11] = -output[11] + output[8];
    step[12] = -output[12] + output[15];
    step[13] = -output[13] + output[14];
    step[14] = output[14] + output[13];
    step[15] = output[15] + output[12];
    step[16] = output[16];
    step[17] = output[17];
    for (i = 18; i < 20; ++i) {
        step[i] = half_btf(-cospi[16], output[i], cospi[48], output[47 - i], cos_bit);
    }
    for (i = 20; i < 22; ++i) {
        step[i] = half_btf(-cospi[48], output[i], -cospi[16], output[47 - i], cos_bit);
    }
    for (i = 22; i < 26; ++i) {
        step[i] = output[i];
    }
    for (i = 26; i < 28; ++i) {
        step[i] = half_btf(cospi[48], output[i], -cospi[16], output[47 - i], cos_bit);
    }
    for (i = 28; i < 30; ++i) {
        step[i] = half_btf(cospi[16], output[i], cospi[48], output[47 - i], cos_bit);
    }
    step[30] = output[30];
    step[31] = output[31];

    // stage 5
    output[0]  = half_btf(cospi[32], step[0], cospi[32], step[1], cos_bit);
    output[4]  = step[4] + step[5];
    output[7]  = step[7] + step[6];
    output[8]  = step[8];
    output[9]  = half_btf(-cospi[16], step[9], cospi[48], step[14], cos_bit);
    output[10] = half_btf(-cospi[48], step[10], -cospi[16], step[13], cos_bit);
    output[11] = step[11];
    output[12] = step[12];
    output[13] = half_btf(cospi[48], step[13], -cospi[16], step[10], cos_bit);
    output[14] = half_btf(cospi[16], step[14], cospi[48], step[9], cos_bit);
    output[15] = step[15];
    // lanes 16..23 and 24..31 use the same eight-lane add/subtract pattern
    for (b = 16; b < 32; b += 8) {
        output[b]     = step[b] + step[b + 3];
        output[b + 1] = step[b + 1] + step[b + 2];
        output[b + 2] = -step[b + 2] + step[b + 1];
        output[b + 3] = -step[b + 3] + step[b];
        output[b + 4] = -step[b + 4] + step[b + 7];
        output[b + 5] = -step[b + 5] + step[b + 6];
        output[b + 6] = step[b + 6] + step[b + 5];
        output[b + 7] = step[b + 7] + step[b + 4];
    }

    // stage 6
    step[0]  = output[0];
    step[4]  = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);
    step[8]  = output[8] + output[9];
    step[11] = output[11] + output[10];
    step[12] = output[12] + output[13];
    step[15] = output[15] + output[14];
    step[16] = output[16];
    step[17] = half_btf(-cospi[8], output[17], cospi[56], output[30], cos_bit);
    step[18] = half_btf(-cospi[56], output[18], -cospi[8], output[29], cos_bit);
    step[19] = output[19];
    step[20] = output[20];
    step[21] = half_btf(-cospi[40], output[21], cospi[24], output[26], cos_bit);
    step[22] = half_btf(-cospi[24], output[22], -cospi[40], output[25], cos_bit);
    step[23] = output[23];
    step[24] = output[24];
    step[25] = half_btf(cospi[24], output[25], -cospi[40], output[22], cos_bit);
    step[26] = half_btf(cospi[40], output[26], cospi[24], output[21], cos_bit);
    step[27] = output[27];
    step[28] = output[28];
    step[29] = half_btf(cospi[56], output[29], -cospi[8], output[18], cos_bit);
    step[30] = half_btf(cospi[8], output[30], cospi[56], output[17], cos_bit);
    step[31] = output[31];

    // stage 7
    output[0]  = step[0];
    output[4]  = step[4];
    output[8]  = half_btf(cospi[60], step[8], cospi[4], step[15], cos_bit);
    output[12] = half_btf(cospi[12], step[12], -cospi[52], step[11], cos_bit);
    // in each group of four from lane 16 up, only the two outer lanes are kept
    for (b = 16; b < 32; b += 4) {
        output[b]     = step[b] + step[b + 1];
        output[b + 3] = step[b + 3] + step[b + 2];
    }

    // stage 8
    step[0]  = output[0];
    step[4]  = output[4];
    step[8]  = output[8];
    step[12] = output[12];
    step[16] = half_btf(cospi[62], output[16], cospi[2], output[31], cos_bit);
    step[20] = half_btf(cospi[54], output[20], cospi[10], output[27], cos_bit);
    step[24] = half_btf(cospi[6], output[24], -cospi[58], output[23], cos_bit);
    step[28] = half_btf(cospi[14], output[28], -cospi[50], output[19], cos_bit);

    // stage 9: final ordering of the eight retained outputs
    for (i = 0; i < 8; ++i) {
        output[i] = step[final_src[i]];
    }
}
7083 | | |
/*
 * Identity "transform" for a 32-point column/row, N4 form: writes only the first
 * eight outputs, each equal to the matching input multiplied by 4. Entries of
 * output past index 7 are left untouched. cos_bit and stage_range are unused.
 */
void svt_av1_fidentity32_N4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    const int32_t*       src     = input;
    const int32_t* const src_end = input + 8;
    int32_t*             dst     = output;

    (void)cos_bit;
    (void)stage_range;

    // Walk both buffers front to back, scaling one coefficient at a time.
    while (src != src_end) {
        *dst++ = *src++ * 4;
    }
}
7091 | | |
7092 | 0 | void svt_av1_fdct64_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
7093 | 0 | (void)stage_range; |
7094 | 0 | const int32_t* cospi; |
7095 | |
|
7096 | 0 | int32_t *bf0, *bf1; |
7097 | 0 | int32_t step[64]; |
7098 | | |
7099 | | // stage 0; |
7100 | | |
7101 | | // stage 1; |
7102 | 0 | bf1 = output; |
7103 | 0 | bf1[0] = input[0] + input[63]; |
7104 | 0 | bf1[1] = input[1] + input[62]; |
7105 | 0 | bf1[2] = input[2] + input[61]; |
7106 | 0 | bf1[3] = input[3] + input[60]; |
7107 | 0 | bf1[4] = input[4] + input[59]; |
7108 | 0 | bf1[5] = input[5] + input[58]; |
7109 | 0 | bf1[6] = input[6] + input[57]; |
7110 | 0 | bf1[7] = input[7] + input[56]; |
7111 | 0 | bf1[8] = input[8] + input[55]; |
7112 | 0 | bf1[9] = input[9] + input[54]; |
7113 | 0 | bf1[10] = input[10] + input[53]; |
7114 | 0 | bf1[11] = input[11] + input[52]; |
7115 | 0 | bf1[12] = input[12] + input[51]; |
7116 | 0 | bf1[13] = input[13] + input[50]; |
7117 | 0 | bf1[14] = input[14] + input[49]; |
7118 | 0 | bf1[15] = input[15] + input[48]; |
7119 | 0 | bf1[16] = input[16] + input[47]; |
7120 | 0 | bf1[17] = input[17] + input[46]; |
7121 | 0 | bf1[18] = input[18] + input[45]; |
7122 | 0 | bf1[19] = input[19] + input[44]; |
7123 | 0 | bf1[20] = input[20] + input[43]; |
7124 | 0 | bf1[21] = input[21] + input[42]; |
7125 | 0 | bf1[22] = input[22] + input[41]; |
7126 | 0 | bf1[23] = input[23] + input[40]; |
7127 | 0 | bf1[24] = input[24] + input[39]; |
7128 | 0 | bf1[25] = input[25] + input[38]; |
7129 | 0 | bf1[26] = input[26] + input[37]; |
7130 | 0 | bf1[27] = input[27] + input[36]; |
7131 | 0 | bf1[28] = input[28] + input[35]; |
7132 | 0 | bf1[29] = input[29] + input[34]; |
7133 | 0 | bf1[30] = input[30] + input[33]; |
7134 | 0 | bf1[31] = input[31] + input[32]; |
7135 | 0 | bf1[32] = -input[32] + input[31]; |
7136 | 0 | bf1[33] = -input[33] + input[30]; |
7137 | 0 | bf1[34] = -input[34] + input[29]; |
7138 | 0 | bf1[35] = -input[35] + input[28]; |
7139 | 0 | bf1[36] = -input[36] + input[27]; |
7140 | 0 | bf1[37] = -input[37] + input[26]; |
7141 | 0 | bf1[38] = -input[38] + input[25]; |
7142 | 0 | bf1[39] = -input[39] + input[24]; |
7143 | 0 | bf1[40] = -input[40] + input[23]; |
7144 | 0 | bf1[41] = -input[41] + input[22]; |
7145 | 0 | bf1[42] = -input[42] + input[21]; |
7146 | 0 | bf1[43] = -input[43] + input[20]; |
7147 | 0 | bf1[44] = -input[44] + input[19]; |
7148 | 0 | bf1[45] = -input[45] + input[18]; |
7149 | 0 | bf1[46] = -input[46] + input[17]; |
7150 | 0 | bf1[47] = -input[47] + input[16]; |
7151 | 0 | bf1[48] = -input[48] + input[15]; |
7152 | 0 | bf1[49] = -input[49] + input[14]; |
7153 | 0 | bf1[50] = -input[50] + input[13]; |
7154 | 0 | bf1[51] = -input[51] + input[12]; |
7155 | 0 | bf1[52] = -input[52] + input[11]; |
7156 | 0 | bf1[53] = -input[53] + input[10]; |
7157 | 0 | bf1[54] = -input[54] + input[9]; |
7158 | 0 | bf1[55] = -input[55] + input[8]; |
7159 | 0 | bf1[56] = -input[56] + input[7]; |
7160 | 0 | bf1[57] = -input[57] + input[6]; |
7161 | 0 | bf1[58] = -input[58] + input[5]; |
7162 | 0 | bf1[59] = -input[59] + input[4]; |
7163 | 0 | bf1[60] = -input[60] + input[3]; |
7164 | 0 | bf1[61] = -input[61] + input[2]; |
7165 | 0 | bf1[62] = -input[62] + input[1]; |
7166 | 0 | bf1[63] = -input[63] + input[0]; |
7167 | | |
7168 | | // stage 2 |
7169 | 0 | cospi = cospi_arr(cos_bit); |
7170 | 0 | bf0 = output; |
7171 | 0 | bf1 = step; |
7172 | 0 | bf1[0] = bf0[0] + bf0[31]; |
7173 | 0 | bf1[1] = bf0[1] + bf0[30]; |
7174 | 0 | bf1[2] = bf0[2] + bf0[29]; |
7175 | 0 | bf1[3] = bf0[3] + bf0[28]; |
7176 | 0 | bf1[4] = bf0[4] + bf0[27]; |
7177 | 0 | bf1[5] = bf0[5] + bf0[26]; |
7178 | 0 | bf1[6] = bf0[6] + bf0[25]; |
7179 | 0 | bf1[7] = bf0[7] + bf0[24]; |
7180 | 0 | bf1[8] = bf0[8] + bf0[23]; |
7181 | 0 | bf1[9] = bf0[9] + bf0[22]; |
7182 | 0 | bf1[10] = bf0[10] + bf0[21]; |
7183 | 0 | bf1[11] = bf0[11] + bf0[20]; |
7184 | 0 | bf1[12] = bf0[12] + bf0[19]; |
7185 | 0 | bf1[13] = bf0[13] + bf0[18]; |
7186 | 0 | bf1[14] = bf0[14] + bf0[17]; |
7187 | 0 | bf1[15] = bf0[15] + bf0[16]; |
7188 | 0 | bf1[16] = -bf0[16] + bf0[15]; |
7189 | 0 | bf1[17] = -bf0[17] + bf0[14]; |
7190 | 0 | bf1[18] = -bf0[18] + bf0[13]; |
7191 | 0 | bf1[19] = -bf0[19] + bf0[12]; |
7192 | 0 | bf1[20] = -bf0[20] + bf0[11]; |
7193 | 0 | bf1[21] = -bf0[21] + bf0[10]; |
7194 | 0 | bf1[22] = -bf0[22] + bf0[9]; |
7195 | 0 | bf1[23] = -bf0[23] + bf0[8]; |
7196 | 0 | bf1[24] = -bf0[24] + bf0[7]; |
7197 | 0 | bf1[25] = -bf0[25] + bf0[6]; |
7198 | 0 | bf1[26] = -bf0[26] + bf0[5]; |
7199 | 0 | bf1[27] = -bf0[27] + bf0[4]; |
7200 | 0 | bf1[28] = -bf0[28] + bf0[3]; |
7201 | 0 | bf1[29] = -bf0[29] + bf0[2]; |
7202 | 0 | bf1[30] = -bf0[30] + bf0[1]; |
7203 | 0 | bf1[31] = -bf0[31] + bf0[0]; |
7204 | 0 | bf1[32] = bf0[32]; |
7205 | 0 | bf1[33] = bf0[33]; |
7206 | 0 | bf1[34] = bf0[34]; |
7207 | 0 | bf1[35] = bf0[35]; |
7208 | 0 | bf1[36] = bf0[36]; |
7209 | 0 | bf1[37] = bf0[37]; |
7210 | 0 | bf1[38] = bf0[38]; |
7211 | 0 | bf1[39] = bf0[39]; |
7212 | 0 | bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); |
7213 | 0 | bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); |
7214 | 0 | bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); |
7215 | 0 | bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); |
7216 | 0 | bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); |
7217 | 0 | bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); |
7218 | 0 | bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); |
7219 | 0 | bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); |
7220 | 0 | bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit); |
7221 | 0 | bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit); |
7222 | 0 | bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit); |
7223 | 0 | bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit); |
7224 | 0 | bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit); |
7225 | 0 | bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit); |
7226 | 0 | bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit); |
7227 | 0 | bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit); |
7228 | 0 | bf1[56] = bf0[56]; |
7229 | 0 | bf1[57] = bf0[57]; |
7230 | 0 | bf1[58] = bf0[58]; |
7231 | 0 | bf1[59] = bf0[59]; |
7232 | 0 | bf1[60] = bf0[60]; |
7233 | 0 | bf1[61] = bf0[61]; |
7234 | 0 | bf1[62] = bf0[62]; |
7235 | 0 | bf1[63] = bf0[63]; |
7236 | | |
7237 | | // stage 3 |
7238 | 0 | cospi = cospi_arr(cos_bit); |
7239 | 0 | bf0 = step; |
7240 | 0 | bf1 = output; |
7241 | 0 | bf1[0] = bf0[0] + bf0[15]; |
7242 | 0 | bf1[1] = bf0[1] + bf0[14]; |
7243 | 0 | bf1[2] = bf0[2] + bf0[13]; |
7244 | 0 | bf1[3] = bf0[3] + bf0[12]; |
7245 | 0 | bf1[4] = bf0[4] + bf0[11]; |
7246 | 0 | bf1[5] = bf0[5] + bf0[10]; |
7247 | 0 | bf1[6] = bf0[6] + bf0[9]; |
7248 | 0 | bf1[7] = bf0[7] + bf0[8]; |
7249 | 0 | bf1[8] = -bf0[8] + bf0[7]; |
7250 | 0 | bf1[9] = -bf0[9] + bf0[6]; |
7251 | 0 | bf1[10] = -bf0[10] + bf0[5]; |
7252 | 0 | bf1[11] = -bf0[11] + bf0[4]; |
7253 | 0 | bf1[12] = -bf0[12] + bf0[3]; |
7254 | 0 | bf1[13] = -bf0[13] + bf0[2]; |
7255 | 0 | bf1[14] = -bf0[14] + bf0[1]; |
7256 | 0 | bf1[15] = -bf0[15] + bf0[0]; |
7257 | 0 | bf1[16] = bf0[16]; |
7258 | 0 | bf1[17] = bf0[17]; |
7259 | 0 | bf1[18] = bf0[18]; |
7260 | 0 | bf1[19] = bf0[19]; |
7261 | 0 | bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); |
7262 | 0 | bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); |
7263 | 0 | bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); |
7264 | 0 | bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); |
7265 | 0 | bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); |
7266 | 0 | bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); |
7267 | 0 | bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); |
7268 | 0 | bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); |
7269 | 0 | bf1[28] = bf0[28]; |
7270 | 0 | bf1[29] = bf0[29]; |
7271 | 0 | bf1[30] = bf0[30]; |
7272 | 0 | bf1[31] = bf0[31]; |
7273 | 0 | bf1[32] = bf0[32] + bf0[47]; |
7274 | 0 | bf1[33] = bf0[33] + bf0[46]; |
7275 | 0 | bf1[34] = bf0[34] + bf0[45]; |
7276 | 0 | bf1[35] = bf0[35] + bf0[44]; |
7277 | 0 | bf1[36] = bf0[36] + bf0[43]; |
7278 | 0 | bf1[37] = bf0[37] + bf0[42]; |
7279 | 0 | bf1[38] = bf0[38] + bf0[41]; |
7280 | 0 | bf1[39] = bf0[39] + bf0[40]; |
7281 | 0 | bf1[40] = -bf0[40] + bf0[39]; |
7282 | 0 | bf1[41] = -bf0[41] + bf0[38]; |
7283 | 0 | bf1[42] = -bf0[42] + bf0[37]; |
7284 | 0 | bf1[43] = -bf0[43] + bf0[36]; |
7285 | 0 | bf1[44] = -bf0[44] + bf0[35]; |
7286 | 0 | bf1[45] = -bf0[45] + bf0[34]; |
7287 | 0 | bf1[46] = -bf0[46] + bf0[33]; |
7288 | 0 | bf1[47] = -bf0[47] + bf0[32]; |
7289 | 0 | bf1[48] = -bf0[48] + bf0[63]; |
7290 | 0 | bf1[49] = -bf0[49] + bf0[62]; |
7291 | 0 | bf1[50] = -bf0[50] + bf0[61]; |
7292 | 0 | bf1[51] = -bf0[51] + bf0[60]; |
7293 | 0 | bf1[52] = -bf0[52] + bf0[59]; |
7294 | 0 | bf1[53] = -bf0[53] + bf0[58]; |
7295 | 0 | bf1[54] = -bf0[54] + bf0[57]; |
7296 | 0 | bf1[55] = -bf0[55] + bf0[56]; |
7297 | 0 | bf1[56] = bf0[56] + bf0[55]; |
7298 | 0 | bf1[57] = bf0[57] + bf0[54]; |
7299 | 0 | bf1[58] = bf0[58] + bf0[53]; |
7300 | 0 | bf1[59] = bf0[59] + bf0[52]; |
7301 | 0 | bf1[60] = bf0[60] + bf0[51]; |
7302 | 0 | bf1[61] = bf0[61] + bf0[50]; |
7303 | 0 | bf1[62] = bf0[62] + bf0[49]; |
7304 | 0 | bf1[63] = bf0[63] + bf0[48]; |
7305 | | |
7306 | | // stage 4 |
7307 | 0 | cospi = cospi_arr(cos_bit); |
7308 | 0 | bf0 = output; |
7309 | 0 | bf1 = step; |
7310 | 0 | bf1[0] = bf0[0] + bf0[7]; |
7311 | 0 | bf1[1] = bf0[1] + bf0[6]; |
7312 | 0 | bf1[2] = bf0[2] + bf0[5]; |
7313 | 0 | bf1[3] = bf0[3] + bf0[4]; |
7314 | 0 | bf1[4] = -bf0[4] + bf0[3]; |
7315 | 0 | bf1[5] = -bf0[5] + bf0[2]; |
7316 | 0 | bf1[6] = -bf0[6] + bf0[1]; |
7317 | 0 | bf1[7] = -bf0[7] + bf0[0]; |
7318 | 0 | bf1[8] = bf0[8]; |
7319 | 0 | bf1[9] = bf0[9]; |
7320 | 0 | bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); |
7321 | 0 | bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); |
7322 | 0 | bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); |
7323 | 0 | bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); |
7324 | 0 | bf1[14] = bf0[14]; |
7325 | 0 | bf1[15] = bf0[15]; |
7326 | 0 | bf1[16] = bf0[16] + bf0[23]; |
7327 | 0 | bf1[17] = bf0[17] + bf0[22]; |
7328 | 0 | bf1[18] = bf0[18] + bf0[21]; |
7329 | 0 | bf1[19] = bf0[19] + bf0[20]; |
7330 | 0 | bf1[20] = -bf0[20] + bf0[19]; |
7331 | 0 | bf1[21] = -bf0[21] + bf0[18]; |
7332 | 0 | bf1[22] = -bf0[22] + bf0[17]; |
7333 | 0 | bf1[23] = -bf0[23] + bf0[16]; |
7334 | 0 | bf1[24] = -bf0[24] + bf0[31]; |
7335 | 0 | bf1[25] = -bf0[25] + bf0[30]; |
7336 | 0 | bf1[26] = -bf0[26] + bf0[29]; |
7337 | 0 | bf1[27] = -bf0[27] + bf0[28]; |
7338 | 0 | bf1[28] = bf0[28] + bf0[27]; |
7339 | 0 | bf1[29] = bf0[29] + bf0[26]; |
7340 | 0 | bf1[30] = bf0[30] + bf0[25]; |
7341 | 0 | bf1[31] = bf0[31] + bf0[24]; |
7342 | 0 | bf1[32] = bf0[32]; |
7343 | 0 | bf1[33] = bf0[33]; |
7344 | 0 | bf1[34] = bf0[34]; |
7345 | 0 | bf1[35] = bf0[35]; |
7346 | 0 | bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); |
7347 | 0 | bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); |
7348 | 0 | bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); |
7349 | 0 | bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); |
7350 | 0 | bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); |
7351 | 0 | bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); |
7352 | 0 | bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); |
7353 | 0 | bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); |
7354 | 0 | bf1[44] = bf0[44]; |
7355 | 0 | bf1[45] = bf0[45]; |
7356 | 0 | bf1[46] = bf0[46]; |
7357 | 0 | bf1[47] = bf0[47]; |
7358 | 0 | bf1[48] = bf0[48]; |
7359 | 0 | bf1[49] = bf0[49]; |
7360 | 0 | bf1[50] = bf0[50]; |
7361 | 0 | bf1[51] = bf0[51]; |
7362 | 0 | bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit); |
7363 | 0 | bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit); |
7364 | 0 | bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit); |
7365 | 0 | bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit); |
7366 | 0 | bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit); |
7367 | 0 | bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit); |
7368 | 0 | bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit); |
7369 | 0 | bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit); |
7370 | 0 | bf1[60] = bf0[60]; |
7371 | 0 | bf1[61] = bf0[61]; |
7372 | 0 | bf1[62] = bf0[62]; |
7373 | 0 | bf1[63] = bf0[63]; |
7374 | | |
7375 | | // stage 5 |
7376 | 0 | cospi = cospi_arr(cos_bit); |
7377 | 0 | bf0 = step; |
7378 | 0 | bf1 = output; |
7379 | 0 | bf1[0] = bf0[0] + bf0[3]; |
7380 | 0 | bf1[1] = bf0[1] + bf0[2]; |
7381 | 0 | bf1[4] = bf0[4]; |
7382 | 0 | bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); |
7383 | 0 | bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); |
7384 | 0 | bf1[7] = bf0[7]; |
7385 | 0 | bf1[8] = bf0[8] + bf0[11]; |
7386 | 0 | bf1[9] = bf0[9] + bf0[10]; |
7387 | 0 | bf1[10] = -bf0[10] + bf0[9]; |
7388 | 0 | bf1[11] = -bf0[11] + bf0[8]; |
7389 | 0 | bf1[12] = -bf0[12] + bf0[15]; |
7390 | 0 | bf1[13] = -bf0[13] + bf0[14]; |
7391 | 0 | bf1[14] = bf0[14] + bf0[13]; |
7392 | 0 | bf1[15] = bf0[15] + bf0[12]; |
7393 | 0 | bf1[16] = bf0[16]; |
7394 | 0 | bf1[17] = bf0[17]; |
7395 | 0 | bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); |
7396 | 0 | bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); |
7397 | 0 | bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); |
7398 | 0 | bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); |
7399 | 0 | bf1[22] = bf0[22]; |
7400 | 0 | bf1[23] = bf0[23]; |
7401 | 0 | bf1[24] = bf0[24]; |
7402 | 0 | bf1[25] = bf0[25]; |
7403 | 0 | bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); |
7404 | 0 | bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); |
7405 | 0 | bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); |
7406 | 0 | bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); |
7407 | 0 | bf1[30] = bf0[30]; |
7408 | 0 | bf1[31] = bf0[31]; |
7409 | 0 | bf1[32] = bf0[32] + bf0[39]; |
7410 | 0 | bf1[33] = bf0[33] + bf0[38]; |
7411 | 0 | bf1[34] = bf0[34] + bf0[37]; |
7412 | 0 | bf1[35] = bf0[35] + bf0[36]; |
7413 | 0 | bf1[36] = -bf0[36] + bf0[35]; |
7414 | 0 | bf1[37] = -bf0[37] + bf0[34]; |
7415 | 0 | bf1[38] = -bf0[38] + bf0[33]; |
7416 | 0 | bf1[39] = -bf0[39] + bf0[32]; |
7417 | 0 | bf1[40] = -bf0[40] + bf0[47]; |
7418 | 0 | bf1[41] = -bf0[41] + bf0[46]; |
7419 | 0 | bf1[42] = -bf0[42] + bf0[45]; |
7420 | 0 | bf1[43] = -bf0[43] + bf0[44]; |
7421 | 0 | bf1[44] = bf0[44] + bf0[43]; |
7422 | 0 | bf1[45] = bf0[45] + bf0[42]; |
7423 | 0 | bf1[46] = bf0[46] + bf0[41]; |
7424 | 0 | bf1[47] = bf0[47] + bf0[40]; |
7425 | 0 | bf1[48] = bf0[48] + bf0[55]; |
7426 | 0 | bf1[49] = bf0[49] + bf0[54]; |
7427 | 0 | bf1[50] = bf0[50] + bf0[53]; |
7428 | 0 | bf1[51] = bf0[51] + bf0[52]; |
7429 | 0 | bf1[52] = -bf0[52] + bf0[51]; |
7430 | 0 | bf1[53] = -bf0[53] + bf0[50]; |
7431 | 0 | bf1[54] = -bf0[54] + bf0[49]; |
7432 | 0 | bf1[55] = -bf0[55] + bf0[48]; |
7433 | 0 | bf1[56] = -bf0[56] + bf0[63]; |
7434 | 0 | bf1[57] = -bf0[57] + bf0[62]; |
7435 | 0 | bf1[58] = -bf0[58] + bf0[61]; |
7436 | 0 | bf1[59] = -bf0[59] + bf0[60]; |
7437 | 0 | bf1[60] = bf0[60] + bf0[59]; |
7438 | 0 | bf1[61] = bf0[61] + bf0[58]; |
7439 | 0 | bf1[62] = bf0[62] + bf0[57]; |
7440 | 0 | bf1[63] = bf0[63] + bf0[56]; |
7441 | | |
7442 | | // stage 6 |
7443 | 0 | cospi = cospi_arr(cos_bit); |
7444 | 0 | bf0 = output; |
7445 | 0 | bf1 = step; |
7446 | 0 | bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); |
7447 | 0 | bf1[4] = bf0[4] + bf0[5]; |
7448 | 0 | bf1[7] = bf0[7] + bf0[6]; |
7449 | 0 | bf1[8] = bf0[8]; |
7450 | 0 | bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); |
7451 | 0 | bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); |
7452 | 0 | bf1[11] = bf0[11]; |
7453 | 0 | bf1[12] = bf0[12]; |
7454 | 0 | bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); |
7455 | 0 | bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); |
7456 | 0 | bf1[15] = bf0[15]; |
7457 | 0 | bf1[16] = bf0[16] + bf0[19]; |
7458 | 0 | bf1[17] = bf0[17] + bf0[18]; |
7459 | 0 | bf1[18] = -bf0[18] + bf0[17]; |
7460 | 0 | bf1[19] = -bf0[19] + bf0[16]; |
7461 | 0 | bf1[20] = -bf0[20] + bf0[23]; |
7462 | 0 | bf1[21] = -bf0[21] + bf0[22]; |
7463 | 0 | bf1[22] = bf0[22] + bf0[21]; |
7464 | 0 | bf1[23] = bf0[23] + bf0[20]; |
7465 | 0 | bf1[24] = bf0[24] + bf0[27]; |
7466 | 0 | bf1[25] = bf0[25] + bf0[26]; |
7467 | 0 | bf1[26] = -bf0[26] + bf0[25]; |
7468 | 0 | bf1[27] = -bf0[27] + bf0[24]; |
7469 | 0 | bf1[28] = -bf0[28] + bf0[31]; |
7470 | 0 | bf1[29] = -bf0[29] + bf0[30]; |
7471 | 0 | bf1[30] = bf0[30] + bf0[29]; |
7472 | 0 | bf1[31] = bf0[31] + bf0[28]; |
7473 | 0 | bf1[32] = bf0[32]; |
7474 | 0 | bf1[33] = bf0[33]; |
7475 | 0 | bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); |
7476 | 0 | bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); |
7477 | 0 | bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); |
7478 | 0 | bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); |
7479 | 0 | bf1[38] = bf0[38]; |
7480 | 0 | bf1[39] = bf0[39]; |
7481 | 0 | bf1[40] = bf0[40]; |
7482 | 0 | bf1[41] = bf0[41]; |
7483 | 0 | bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); |
7484 | 0 | bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); |
7485 | 0 | bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); |
7486 | 0 | bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); |
7487 | 0 | bf1[46] = bf0[46]; |
7488 | 0 | bf1[47] = bf0[47]; |
7489 | 0 | bf1[48] = bf0[48]; |
7490 | 0 | bf1[49] = bf0[49]; |
7491 | 0 | bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit); |
7492 | 0 | bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit); |
7493 | 0 | bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit); |
7494 | 0 | bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit); |
7495 | 0 | bf1[54] = bf0[54]; |
7496 | 0 | bf1[55] = bf0[55]; |
7497 | 0 | bf1[56] = bf0[56]; |
7498 | 0 | bf1[57] = bf0[57]; |
7499 | 0 | bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit); |
7500 | 0 | bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit); |
7501 | 0 | bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit); |
7502 | 0 | bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); |
7503 | 0 | bf1[62] = bf0[62]; |
7504 | 0 | bf1[63] = bf0[63]; |
7505 | | |
7506 | | // stage 7 |
7507 | 0 | cospi = cospi_arr(cos_bit); |
7508 | 0 | bf0 = step; |
7509 | 0 | bf1 = output; |
7510 | 0 | bf1[0] = bf0[0]; |
7511 | 0 | bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); |
7512 | 0 | bf1[8] = bf0[8] + bf0[9]; |
7513 | 0 | bf1[11] = bf0[11] + bf0[10]; |
7514 | 0 | bf1[12] = bf0[12] + bf0[13]; |
7515 | 0 | bf1[15] = bf0[15] + bf0[14]; |
7516 | 0 | bf1[16] = bf0[16]; |
7517 | 0 | bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); |
7518 | 0 | bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); |
7519 | 0 | bf1[19] = bf0[19]; |
7520 | 0 | bf1[20] = bf0[20]; |
7521 | 0 | bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); |
7522 | 0 | bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); |
7523 | 0 | bf1[23] = bf0[23]; |
7524 | 0 | bf1[24] = bf0[24]; |
7525 | 0 | bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); |
7526 | 0 | bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); |
7527 | 0 | bf1[27] = bf0[27]; |
7528 | 0 | bf1[28] = bf0[28]; |
7529 | 0 | bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); |
7530 | 0 | bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); |
7531 | 0 | bf1[31] = bf0[31]; |
7532 | 0 | bf1[32] = bf0[32] + bf0[35]; |
7533 | 0 | bf1[33] = bf0[33] + bf0[34]; |
7534 | 0 | bf1[34] = -bf0[34] + bf0[33]; |
7535 | 0 | bf1[35] = -bf0[35] + bf0[32]; |
7536 | 0 | bf1[36] = -bf0[36] + bf0[39]; |
7537 | 0 | bf1[37] = -bf0[37] + bf0[38]; |
7538 | 0 | bf1[38] = bf0[38] + bf0[37]; |
7539 | 0 | bf1[39] = bf0[39] + bf0[36]; |
7540 | 0 | bf1[40] = bf0[40] + bf0[43]; |
7541 | 0 | bf1[41] = bf0[41] + bf0[42]; |
7542 | 0 | bf1[42] = -bf0[42] + bf0[41]; |
7543 | 0 | bf1[43] = -bf0[43] + bf0[40]; |
7544 | 0 | bf1[44] = -bf0[44] + bf0[47]; |
7545 | 0 | bf1[45] = -bf0[45] + bf0[46]; |
7546 | 0 | bf1[46] = bf0[46] + bf0[45]; |
7547 | 0 | bf1[47] = bf0[47] + bf0[44]; |
7548 | 0 | bf1[48] = bf0[48] + bf0[51]; |
7549 | 0 | bf1[49] = bf0[49] + bf0[50]; |
7550 | 0 | bf1[50] = -bf0[50] + bf0[49]; |
7551 | 0 | bf1[51] = -bf0[51] + bf0[48]; |
7552 | 0 | bf1[52] = -bf0[52] + bf0[55]; |
7553 | 0 | bf1[53] = -bf0[53] + bf0[54]; |
7554 | 0 | bf1[54] = bf0[54] + bf0[53]; |
7555 | 0 | bf1[55] = bf0[55] + bf0[52]; |
7556 | 0 | bf1[56] = bf0[56] + bf0[59]; |
7557 | 0 | bf1[57] = bf0[57] + bf0[58]; |
7558 | 0 | bf1[58] = -bf0[58] + bf0[57]; |
7559 | 0 | bf1[59] = -bf0[59] + bf0[56]; |
7560 | 0 | bf1[60] = -bf0[60] + bf0[63]; |
7561 | 0 | bf1[61] = -bf0[61] + bf0[62]; |
7562 | 0 | bf1[62] = bf0[62] + bf0[61]; |
7563 | 0 | bf1[63] = bf0[63] + bf0[60]; |
7564 | | |
7565 | | // stage 8 |
7566 | 0 | cospi = cospi_arr(cos_bit); |
7567 | 0 | bf0 = output; |
7568 | 0 | bf1 = step; |
7569 | 0 | bf1[0] = bf0[0]; |
7570 | 0 | bf1[4] = bf0[4]; |
7571 | 0 | bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); |
7572 | 0 | bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); |
7573 | 0 | bf1[16] = bf0[16] + bf0[17]; |
7574 | 0 | bf1[19] = bf0[19] + bf0[18]; |
7575 | 0 | bf1[20] = bf0[20] + bf0[21]; |
7576 | 0 | bf1[23] = bf0[23] + bf0[22]; |
7577 | 0 | bf1[24] = bf0[24] + bf0[25]; |
7578 | 0 | bf1[27] = bf0[27] + bf0[26]; |
7579 | 0 | bf1[28] = bf0[28] + bf0[29]; |
7580 | 0 | bf1[31] = bf0[31] + bf0[30]; |
7581 | 0 | bf1[32] = bf0[32]; |
7582 | 0 | bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); |
7583 | 0 | bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); |
7584 | 0 | bf1[35] = bf0[35]; |
7585 | 0 | bf1[36] = bf0[36]; |
7586 | 0 | bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); |
7587 | 0 | bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); |
7588 | 0 | bf1[39] = bf0[39]; |
7589 | 0 | bf1[40] = bf0[40]; |
7590 | 0 | bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); |
7591 | 0 | bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); |
7592 | 0 | bf1[43] = bf0[43]; |
7593 | 0 | bf1[44] = bf0[44]; |
7594 | 0 | bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); |
7595 | 0 | bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); |
7596 | 0 | bf1[47] = bf0[47]; |
7597 | 0 | bf1[48] = bf0[48]; |
7598 | 0 | bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit); |
7599 | 0 | bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit); |
7600 | 0 | bf1[51] = bf0[51]; |
7601 | 0 | bf1[52] = bf0[52]; |
7602 | 0 | bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit); |
7603 | 0 | bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit); |
7604 | 0 | bf1[55] = bf0[55]; |
7605 | 0 | bf1[56] = bf0[56]; |
7606 | 0 | bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit); |
7607 | 0 | bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit); |
7608 | 0 | bf1[59] = bf0[59]; |
7609 | 0 | bf1[60] = bf0[60]; |
7610 | 0 | bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); |
7611 | 0 | bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); |
7612 | 0 | bf1[63] = bf0[63]; |
7613 | | |
7614 | | // stage 9 |
7615 | 0 | cospi = cospi_arr(cos_bit); |
7616 | 0 | bf0 = step; |
7617 | 0 | bf1 = output; |
7618 | 0 | bf1[0] = bf0[0]; |
7619 | 0 | bf1[4] = bf0[4]; |
7620 | 0 | bf1[8] = bf0[8]; |
7621 | 0 | bf1[12] = bf0[12]; |
7622 | 0 | bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); |
7623 | 0 | bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); |
7624 | 0 | bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); |
7625 | 0 | bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); |
7626 | 0 | bf1[32] = bf0[32] + bf0[33]; |
7627 | 0 | bf1[35] = bf0[35] + bf0[34]; |
7628 | 0 | bf1[36] = bf0[36] + bf0[37]; |
7629 | 0 | bf1[39] = bf0[39] + bf0[38]; |
7630 | 0 | bf1[40] = bf0[40] + bf0[41]; |
7631 | 0 | bf1[43] = bf0[43] + bf0[42]; |
7632 | 0 | bf1[44] = bf0[44] + bf0[45]; |
7633 | 0 | bf1[47] = bf0[47] + bf0[46]; |
7634 | 0 | bf1[48] = bf0[48] + bf0[49]; |
7635 | 0 | bf1[51] = bf0[51] + bf0[50]; |
7636 | 0 | bf1[52] = bf0[52] + bf0[53]; |
7637 | 0 | bf1[55] = bf0[55] + bf0[54]; |
7638 | 0 | bf1[56] = bf0[56] + bf0[57]; |
7639 | 0 | bf1[59] = bf0[59] + bf0[58]; |
7640 | 0 | bf1[60] = bf0[60] + bf0[61]; |
7641 | 0 | bf1[63] = bf0[63] + bf0[62]; |
7642 | | |
7643 | | // stage 10 |
7644 | 0 | cospi = cospi_arr(cos_bit); |
7645 | 0 | bf0 = output; |
7646 | 0 | bf1 = step; |
7647 | 0 | bf1[0] = bf0[0]; |
7648 | 0 | bf1[4] = bf0[4]; |
7649 | 0 | bf1[8] = bf0[8]; |
7650 | 0 | bf1[12] = bf0[12]; |
7651 | 0 | bf1[16] = bf0[16]; |
7652 | 0 | bf1[20] = bf0[20]; |
7653 | 0 | bf1[24] = bf0[24]; |
7654 | 0 | bf1[28] = bf0[28]; |
7655 | 0 | bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit); |
7656 | 0 | bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit); |
7657 | 0 | bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit); |
7658 | 0 | bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit); |
7659 | 0 | bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit); |
7660 | 0 | bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit); |
7661 | 0 | bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit); |
7662 | 0 | bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit); |
7663 | | |
7664 | | // stage 11 |
7665 | 0 | bf0 = step; |
7666 | 0 | bf1 = output; |
7667 | 0 | bf1[0] = bf0[0]; |
7668 | 0 | bf1[1] = bf0[32]; |
7669 | 0 | bf1[2] = bf0[16]; |
7670 | 0 | bf1[3] = bf0[48]; |
7671 | 0 | bf1[4] = bf0[8]; |
7672 | 0 | bf1[5] = bf0[40]; |
7673 | 0 | bf1[6] = bf0[24]; |
7674 | 0 | bf1[7] = bf0[56]; |
7675 | 0 | bf1[8] = bf0[4]; |
7676 | 0 | bf1[9] = bf0[36]; |
7677 | 0 | bf1[10] = bf0[20]; |
7678 | 0 | bf1[11] = bf0[52]; |
7679 | 0 | bf1[12] = bf0[12]; |
7680 | 0 | bf1[13] = bf0[44]; |
7681 | 0 | bf1[14] = bf0[28]; |
7682 | 0 | bf1[15] = bf0[60]; |
7683 | 0 | } |
7684 | | |
7685 | 0 | static void av1_fidentity64_N4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) { |
7686 | 0 | (void)stage_range; |
7687 | 0 | (void)cos_bit; |
7688 | 0 | for (int32_t i = 0; i < 16; ++i) { |
7689 | 0 | output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits); |
7690 | 0 | } |
7691 | 0 | assert(stage_range[0] + new_sqrt2_bits <= 32); |
7692 | 0 | } |
7693 | | |
7694 | 0 | static INLINE TxfmFunc fwd_txfm_type_to_func_N4(TxfmType txfmtype) { |
7695 | 0 | switch (txfmtype) { |
7696 | 0 | case TXFM_TYPE_DCT4: |
7697 | 0 | return svt_av1_fdct4_new_N4; |
7698 | 0 | case TXFM_TYPE_DCT8: |
7699 | 0 | return svt_av1_fdct8_new_N4; |
7700 | 0 | case TXFM_TYPE_DCT16: |
7701 | 0 | return svt_av1_fdct16_new_N4; |
7702 | 0 | case TXFM_TYPE_DCT32: |
7703 | 0 | return svt_av1_fdct32_new_N4; |
7704 | 0 | case TXFM_TYPE_DCT64: |
7705 | 0 | return svt_av1_fdct64_new_N4; |
7706 | 0 | case TXFM_TYPE_ADST4: |
7707 | 0 | return svt_av1_fadst4_new_N4; |
7708 | 0 | case TXFM_TYPE_ADST8: |
7709 | 0 | return svt_av1_fadst8_new_N4; |
7710 | 0 | case TXFM_TYPE_ADST16: |
7711 | 0 | return svt_av1_fadst16_new_N4; |
7712 | 0 | case TXFM_TYPE_ADST32: |
7713 | 0 | return av1_fadst32_new; |
7714 | 0 | case TXFM_TYPE_IDENTITY4: |
7715 | 0 | return svt_av1_fidentity4_N4_c; |
7716 | 0 | case TXFM_TYPE_IDENTITY8: |
7717 | 0 | return svt_av1_fidentity8_N4_c; |
7718 | 0 | case TXFM_TYPE_IDENTITY16: |
7719 | 0 | return svt_av1_fidentity16_N4_c; |
7720 | 0 | case TXFM_TYPE_IDENTITY32: |
7721 | 0 | return svt_av1_fidentity32_N4_c; |
7722 | 0 | case TXFM_TYPE_IDENTITY64: |
7723 | 0 | return av1_fidentity64_N4_c; |
7724 | 0 | default: |
7725 | 0 | assert(0); |
7726 | 0 | return NULL; |
7727 | 0 | } |
7728 | 0 | } |
7729 | | |
7730 | | static INLINE void av1_tranform_two_d_core_N4_c(int16_t* input, uint32_t input_stride, int32_t* output, |
7731 | 0 | const Txfm2dFlipCfg* cfg, int32_t* buf, uint8_t bit_depth) { |
7732 | 0 | int32_t c, r; |
7733 | | // Note when assigning txfm_size_col, we use the txfm_size from the |
7734 | | // row configuration and vice versa. This is intentionally done to |
7735 | | // accurately perform rectangular transforms. When the transform is |
7736 | | // rectangular, the number of columns will be the same as the |
7737 | | // txfm_size stored in the row cfg struct. It will make no difference |
7738 | | // for square transforms. |
7739 | 0 | const int32_t txfm_size_col = tx_size_wide[cfg->tx_size]; |
7740 | 0 | const int32_t txfm_size_row = tx_size_high[cfg->tx_size]; |
7741 | | // Take the shift from the larger dimension in the rectangular case. |
7742 | 0 | const int8_t* shift = cfg->shift; |
7743 | 0 | const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); |
7744 | 0 | int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; |
7745 | 0 | int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; |
7746 | 0 | assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); |
7747 | 0 | assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); |
7748 | 0 | svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth); |
7749 | |
|
7750 | 0 | const int8_t cos_bit_col = cfg->cos_bit_col; |
7751 | 0 | const int8_t cos_bit_row = cfg->cos_bit_row; |
7752 | 0 | const TxfmFunc txfm_func_col = fwd_txfm_type_to_func_N4(cfg->txfm_type_col); |
7753 | 0 | const TxfmFunc txfm_func_row = fwd_txfm_type_to_func_N4(cfg->txfm_type_row); |
7754 | 0 | ASSERT(txfm_func_col != NULL); |
7755 | 0 | ASSERT(txfm_func_row != NULL); |
7756 | | // use output buffer as temp buffer |
7757 | 0 | int32_t* temp_in = output; |
7758 | 0 | int32_t* temp_out = output + txfm_size_row; |
7759 | | |
7760 | | // Columns |
7761 | 0 | for (c = 0; c < txfm_size_col; ++c) { |
7762 | 0 | if (cfg->ud_flip == 0) { |
7763 | 0 | for (r = 0; r < txfm_size_row; ++r) { |
7764 | 0 | temp_in[r] = input[r * input_stride + c]; |
7765 | 0 | } |
7766 | 0 | } else { |
7767 | 0 | for (r = 0; r < txfm_size_row; ++r) { |
7768 | | // flip upside down |
7769 | 0 | temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c]; |
7770 | 0 | } |
7771 | 0 | } |
7772 | 0 | svt_av1_round_shift_array_c(temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c |
7773 | 0 | txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); |
7774 | 0 | svt_av1_round_shift_array_c(temp_out, txfm_size_row / 4, -shift[1]); // NM svt_av1_round_shift_array_c |
7775 | 0 | if (cfg->lr_flip == 0) { |
7776 | 0 | for (r = 0; r < txfm_size_row; ++r) { |
7777 | 0 | buf[r * txfm_size_col + c] = temp_out[r]; |
7778 | 0 | } |
7779 | 0 | } else { |
7780 | 0 | for (r = 0; r < txfm_size_row; ++r) { |
7781 | | // flip from left to right |
7782 | 0 | buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r]; |
7783 | 0 | } |
7784 | 0 | } |
7785 | 0 | } |
7786 | | |
7787 | | // Rows |
7788 | 0 | for (r = 0; r < txfm_size_row / 4; ++r) { |
7789 | 0 | txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row); |
7790 | 0 | svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col / 4, -shift[2]); |
7791 | |
|
7792 | 0 | if (abs(rect_type) == 1) { |
7793 | | // Multiply everything by Sqrt2 if the transform is rectangular and the |
7794 | | // size difference is a factor of 2. |
7795 | 0 | for (c = 0; c < txfm_size_col / 4; ++c) { |
7796 | 0 | output[r * txfm_size_col + c] = round_shift((int64_t)output[r * txfm_size_col + c] * new_sqrt2, |
7797 | 0 | new_sqrt2_bits); |
7798 | 0 | } |
7799 | 0 | } |
7800 | 0 | } |
7801 | 0 | for (int i = 0; i < (txfm_size_col * txfm_size_row); i++) { |
7802 | 0 | if (i % txfm_size_col >= (txfm_size_col >> 2) || i / txfm_size_col >= (txfm_size_row >> 2)) { |
7803 | 0 | output[i] = 0; |
7804 | 0 | } |
7805 | 0 | } |
7806 | 0 | } |
7807 | | |
7808 | | void svt_aom_transform_two_d_64x64_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7809 | 0 | uint8_t bit_depth) { |
7810 | 0 | int32_t intermediate_transform_buffer[64 * 64]; |
7811 | 0 | Txfm2dFlipCfg cfg; |
7812 | 0 | svt_aom_transform_config(transform_type, TX_64X64, &cfg); |
7813 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7814 | 0 | } |
7815 | | |
7816 | | void svt_aom_transform_two_d_32x32_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7817 | 0 | uint8_t bit_depth) { |
7818 | 0 | int32_t intermediate_transform_buffer[32 * 32]; |
7819 | 0 | Txfm2dFlipCfg cfg; |
7820 | 0 | svt_aom_transform_config(transform_type, TX_32X32, &cfg); |
7821 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7822 | 0 | } |
7823 | | |
7824 | | void svt_aom_transform_two_d_16x16_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7825 | 0 | uint8_t bit_depth) { |
7826 | 0 | int32_t intermediate_transform_buffer[16 * 16]; |
7827 | 0 | Txfm2dFlipCfg cfg; |
7828 | 0 | svt_aom_transform_config(transform_type, TX_16X16, &cfg); |
7829 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7830 | 0 | } |
7831 | | |
7832 | | void svt_aom_transform_two_d_8x8_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7833 | 0 | uint8_t bit_depth) { |
7834 | 0 | int32_t intermediate_transform_buffer[8 * 8]; |
7835 | 0 | Txfm2dFlipCfg cfg; |
7836 | 0 | svt_aom_transform_config(transform_type, TX_8X8, &cfg); |
7837 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7838 | 0 | } |
7839 | | |
7840 | | void svt_aom_transform_two_d_4x4_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7841 | 0 | uint8_t bit_depth) { |
7842 | 0 | int32_t intermediate_transform_buffer[4 * 4]; |
7843 | 0 | Txfm2dFlipCfg cfg; |
7844 | 0 | svt_aom_transform_config(transform_type, TX_4X4, &cfg); |
7845 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7846 | 0 | } |
7847 | | |
7848 | | void svt_av1_fwd_txfm2d_64x32_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7849 | 0 | uint8_t bit_depth) { |
7850 | 0 | int32_t intermediate_transform_buffer[64 * 32]; |
7851 | 0 | Txfm2dFlipCfg cfg; |
7852 | 0 | svt_aom_transform_config(transform_type, TX_64X32, &cfg); |
7853 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7854 | 0 | } |
7855 | | |
7856 | | void svt_av1_fwd_txfm2d_32x64_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7857 | 0 | uint8_t bit_depth) { |
7858 | 0 | int32_t intermediate_transform_buffer[32 * 64]; |
7859 | 0 | Txfm2dFlipCfg cfg; |
7860 | 0 | svt_aom_transform_config(transform_type, TX_32X64, &cfg); |
7861 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7862 | 0 | } |
7863 | | |
7864 | | void svt_av1_fwd_txfm2d_64x16_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7865 | 0 | uint8_t bit_depth) { |
7866 | 0 | int32_t intermediate_transform_buffer[64 * 16]; |
7867 | 0 | Txfm2dFlipCfg cfg; |
7868 | 0 | svt_aom_transform_config(transform_type, TX_64X16, &cfg); |
7869 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7870 | 0 | } |
7871 | | |
7872 | | void svt_av1_fwd_txfm2d_16x64_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7873 | 0 | uint8_t bit_depth) { |
7874 | 0 | int32_t intermediate_transform_buffer[16 * 64]; |
7875 | 0 | Txfm2dFlipCfg cfg; |
7876 | 0 | svt_aom_transform_config(transform_type, TX_16X64, &cfg); |
7877 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7878 | 0 | } |
7879 | | |
7880 | | void svt_av1_fwd_txfm2d_32x16_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7881 | 0 | uint8_t bit_depth) { |
7882 | 0 | int32_t intermediate_transform_buffer[32 * 16]; |
7883 | 0 | Txfm2dFlipCfg cfg; |
7884 | 0 | svt_aom_transform_config(transform_type, TX_32X16, &cfg); |
7885 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7886 | 0 | } |
7887 | | |
7888 | | void svt_av1_fwd_txfm2d_16x32_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7889 | 0 | uint8_t bit_depth) { |
7890 | 0 | int32_t intermediate_transform_buffer[16 * 32]; |
7891 | 0 | Txfm2dFlipCfg cfg; |
7892 | 0 | svt_aom_transform_config(transform_type, TX_16X32, &cfg); |
7893 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7894 | 0 | } |
7895 | | |
7896 | | void svt_av1_fwd_txfm2d_16x8_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7897 | 0 | uint8_t bit_depth) { |
7898 | 0 | int32_t intermediate_transform_buffer[16 * 8]; |
7899 | 0 | Txfm2dFlipCfg cfg; |
7900 | 0 | svt_aom_transform_config(transform_type, TX_16X8, &cfg); |
7901 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7902 | 0 | } |
7903 | | |
7904 | | void svt_av1_fwd_txfm2d_8x16_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7905 | 0 | uint8_t bit_depth) { |
7906 | 0 | int32_t intermediate_transform_buffer[8 * 16]; |
7907 | 0 | Txfm2dFlipCfg cfg; |
7908 | 0 | svt_aom_transform_config(transform_type, TX_8X16, &cfg); |
7909 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7910 | 0 | } |
7911 | | |
7912 | | void svt_av1_fwd_txfm2d_32x8_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7913 | 0 | uint8_t bit_depth) { |
7914 | 0 | int32_t intermediate_transform_buffer[32 * 8]; |
7915 | 0 | Txfm2dFlipCfg cfg; |
7916 | 0 | svt_aom_transform_config(transform_type, TX_32X8, &cfg); |
7917 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7918 | 0 | } |
7919 | | |
7920 | | void svt_av1_fwd_txfm2d_8x32_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7921 | 0 | uint8_t bit_depth) { |
7922 | 0 | int32_t intermediate_transform_buffer[8 * 32]; |
7923 | 0 | Txfm2dFlipCfg cfg; |
7924 | 0 | svt_aom_transform_config(transform_type, TX_8X32, &cfg); |
7925 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7926 | 0 | } |
7927 | | |
7928 | | void svt_av1_fwd_txfm2d_16x4_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7929 | 0 | uint8_t bit_depth) { |
7930 | 0 | int32_t intermediate_transform_buffer[16 * 4]; |
7931 | 0 | Txfm2dFlipCfg cfg; |
7932 | 0 | svt_aom_transform_config(transform_type, TX_16X4, &cfg); |
7933 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7934 | 0 | } |
7935 | | |
7936 | | void svt_av1_fwd_txfm2d_4x16_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7937 | 0 | uint8_t bit_depth) { |
7938 | 0 | int32_t intermediate_transform_buffer[4 * 16]; |
7939 | 0 | Txfm2dFlipCfg cfg; |
7940 | 0 | svt_aom_transform_config(transform_type, TX_4X16, &cfg); |
7941 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7942 | 0 | } |
7943 | | |
7944 | | void svt_av1_fwd_txfm2d_8x4_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7945 | 0 | uint8_t bit_depth) { |
7946 | 0 | int32_t intermediate_transform_buffer[8 * 4]; |
7947 | 0 | Txfm2dFlipCfg cfg; |
7948 | 0 | svt_aom_transform_config(transform_type, TX_8X4, &cfg); |
7949 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7950 | 0 | } |
7951 | | |
7952 | | void svt_av1_fwd_txfm2d_4x8_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type, |
7953 | 0 | uint8_t bit_depth) { |
7954 | 0 | int32_t intermediate_transform_buffer[4 * 8]; |
7955 | 0 | Txfm2dFlipCfg cfg; |
7956 | 0 | svt_aom_transform_config(transform_type, TX_4X8, &cfg); |
7957 | 0 | av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth); |
7958 | 0 | } |