Coverage Report

Created: 2026-05-16 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/svt-av1/Source/Lib/Codec/transforms.c
Line
Count
Source
1
/*
2
* Copyright(c) 2019 Intel Corporation
3
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
4
*
5
* This source code is subject to the terms of the BSD 2 Clause License and
6
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7
* was not distributed with this source code in the LICENSE file, you can
8
* obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
9
* Media Patent License 1.0 was not distributed with this source code in the
10
* PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
11
*/
12
13
#include <stdlib.h>
14
#include "transforms.h"
15
#include "aom_dsp_rtcd.h"
16
17
// Cosine-bit tables for the forward transform, indexed [txw_idx][txh_idx]; going by the names,
// one applies to the column pass and the other to the row pass.
// The 0 entries appear exactly where the two indices differ by 3 or more; presumably those
// width/height combinations are never used -- TODO confirm.
const int8_t fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/][MAX_TXWH_IDX /*txh_idx*/] = {
18
    {13, 13, 13, 0, 0}, {13, 13, 13, 12, 0}, {13, 13, 13, 12, 13}, {0, 13, 13, 12, 13}, {0, 0, 13, 12, 13}};
19
const int8_t fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/][MAX_TXWH_IDX /*txh_idx*/] = {
20
    {13, 13, 12, 0, 0}, {13, 13, 13, 12, 0}, {13, 13, 12, 13, 12}, {0, 12, 13, 12, 11}, {0, 0, 12, 11, 10}};
21
22
// Number of transform blocks per block size at each tx depth (columns are depth 0..MAX_VARTX_DEPTH).
// Rows follow the block-size order named in the trailing comments.
// NOTE(review): BLOCK_8X8 lists 4 blocks at depth 2, while the tx_org table in this file marks
// depth 2 of BLOCK_8X8 as "not allowed" and supplies a single origin -- confirm this is intended.
const uint8_t tx_blocks_per_depth[BLOCK_SIZES_ALL][MAX_VARTX_DEPTH + 1] = {
23
    {1, 1, 1}, // BLOCK_4X4
24
    {1, 1, 1}, // BLOCK_4X8
25
    {1, 1, 1}, // BLOCK_8X4
26
    {1, 4, 4}, // BLOCK_8X8
27
    {1, 2, 8}, // BLOCK_8X16
28
    {1, 2, 8}, // BLOCK_16X8
29
    {1, 4, 16}, // BLOCK_16X16
30
    {1, 2, 8}, // BLOCK_16X32
31
    {1, 2, 8}, // BLOCK_32X16
32
    {1, 4, 16}, // BLOCK_32X32
33
    {1, 2, 8}, // BLOCK_32X64
34
    {1, 2, 8}, // BLOCK_64X32
35
    {1, 4, 16}, // BLOCK_64X64
36
    {2, 2, 2}, // BLOCK_64X128
37
    {2, 2, 2}, // BLOCK_128X64
38
    {4, 4, 4}, // BLOCK_128X128
39
    {1, 2, 4}, // BLOCK_4X16
40
    {1, 2, 4}, // BLOCK_16X4
41
    {1, 2, 4}, // BLOCK_8X32
42
    {1, 2, 4}, // BLOCK_32X8
43
    {1, 2, 4}, // BLOCK_16X64
44
    {1, 2, 4} // BLOCK_64X16
45
};
46
47
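// Origin of every transform block, given as an offset inside its block and indexed by
// [block size][is_inter][tx_depth][transform block index]. Separate tables for INTRA (idx 0) and
// INTER (idx 1) are needed because the two list the tx depth 2 transform blocks in different orders.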
48
const Position tx_org[BLOCK_SIZES_ALL][2 /*is_inter*/][MAX_VARTX_DEPTH + 1][MAX_TXB_COUNT] = {
49
    {// BLOCK_4X4
50
     {// intra
51
      {// tx_depth 0
52
       {0, 0}},
53
      {// tx_depth 1
54
       {0, 0}},
55
      {// tx_depth 2
56
       {0, 0}}},
57
     {// inter
58
      {// tx_depth 0
59
       {0, 0}},
60
      {// tx_depth 1
61
       {0, 0}},
62
      {// tx_depth 2
63
       {0, 0}}}},
64
    {// BLOCK_4X8
65
     {// intra
66
      {// tx_depth 0
67
       {0, 0}},
68
      {// tx_depth 1
69
       {0, 0}},
70
      {// tx_depth 2
71
       {0, 0}}},
72
     {// inter
73
      {// tx_depth 0
74
       {0, 0}},
75
      {// tx_depth 1
76
       {0, 0}},
77
      {// tx_depth 2
78
       {0, 0}}}},
79
    {// BLOCK_8X4
80
     {// intra
81
      {// tx_depth 0
82
       {0, 0}},
83
      {// tx_depth 1
84
       {0, 0}},
85
      {// tx_depth 2
86
       {0, 0}}},
87
     {// inter
88
      {// tx_depth 0
89
       {0, 0}},
90
      {// tx_depth 1
91
       {0, 0}},
92
      {// tx_depth 2
93
       {0, 0}}}},
94
    {// BLOCK_8X8
95
     {// intra
96
      {// tx_depth 0
97
       {0, 0}},
98
      {// tx_depth 1
99
       {0, 0},
100
       {4, 0},
101
       {0, 4},
102
       {4, 4}},
103
      {
104
          // tx_depth 2
105
          {0, 0} // not allowed
106
      }},
107
     {// inter
108
      {// tx_depth 0
109
       {0, 0}},
110
      {// tx_depth 1
111
       {0, 0},
112
       {4, 0},
113
       {0, 4},
114
       {4, 4}},
115
      {
116
          // tx_depth 2
117
          {0, 0} // not allowed
118
      }}},
119
    {// BLOCK_8X16
120
     {// intra
121
      {// tx_depth 0
122
       {0, 0}},
123
      {// tx_depth 1
124
       {0, 0},
125
       {0, 8}},
126
      {// tx_depth 2
127
       {0, 0},
128
       {4, 0},
129
       {0, 4},
130
       {4, 4},
131
       {0, 8},
132
       {4, 8},
133
       {0, 12},
134
       {4, 12}}},
135
     {// inter
136
      {// tx_depth 0
137
       {0, 0}},
138
      {// tx_depth 1
139
       {0, 0},
140
       {0, 8}},
141
      {// tx_depth 2
142
       {0, 0},
143
       {4, 0},
144
       {0, 4},
145
       {4, 4},
146
       {0, 8},
147
       {4, 8},
148
       {0, 12},
149
       {4, 12}}}},
150
    {// BLOCK_16X8
151
     {// intra
152
      {// tx_depth 0
153
       {0, 0}},
154
      {// tx_depth 1
155
       {0, 0},
156
       {8, 0}},
157
      {// tx_depth 2
158
       {0, 0},
159
       {4, 0},
160
       {8, 0},
161
       {12, 0},
162
       {0, 4},
163
       {4, 4},
164
       {8, 4},
165
       {12, 4}}},
166
     {// inter
167
      {// tx_depth 0
168
       {0, 0}},
169
      {// tx_depth 1
170
       {0, 0},
171
       {8, 0}},
172
      {// tx_depth 2
173
       {0, 0},
174
       {4, 0},
175
       {0, 4},
176
       {4, 4},
177
       {8, 0},
178
       {12, 0},
179
       {8, 4},
180
       {12, 4}}}},
181
    {// BLOCK_16X16
182
     {// intra
183
      {// tx_depth 0
184
       {0, 0}},
185
      {// tx_depth 1
186
       {0, 0},
187
       {8, 0},
188
       {0, 8},
189
       {8, 8}},
190
      {// tx_depth 2
191
       {0, 0},
192
       {4, 0},
193
       {8, 0},
194
       {12, 0},
195
       {0, 4},
196
       {4, 4},
197
       {8, 4},
198
       {12, 4},
199
       {0, 8},
200
       {4, 8},
201
       {8, 8},
202
       {12, 8},
203
       {0, 12},
204
       {4, 12},
205
       {8, 12},
206
       {12, 12}}},
207
     {// inter
208
      {// tx_depth 0
209
       {0, 0}},
210
      {// tx_depth 1
211
       {0, 0},
212
       {8, 0},
213
       {0, 8},
214
       {8, 8}},
215
      {// tx_depth 2
216
       {0, 0},
217
       {4, 0},
218
       {0, 4},
219
       {4, 4},
220
       {8, 0},
221
       {12, 0},
222
       {8, 4},
223
       {12, 4},
224
       {0, 8},
225
       {4, 8},
226
       {0, 12},
227
       {4, 12},
228
       {8, 8},
229
       {12, 8},
230
       {8, 12},
231
       {12, 12}}}},
232
    {// BLOCK_16X32
233
     {// intra
234
      {// tx_depth 0
235
       {0, 0}},
236
      {// tx_depth 1
237
       {0, 0},
238
       {0, 16}},
239
      {// tx_depth 2
240
       {0, 0},
241
       {8, 0},
242
       {0, 8},
243
       {8, 8},
244
       {0, 16},
245
       {8, 16},
246
       {0, 24},
247
       {8, 24}}},
248
     {// inter
249
      {// tx_depth 0
250
       {0, 0}},
251
      {// tx_depth 1
252
       {0, 0},
253
       {0, 16}},
254
      {// tx_depth 2
255
       {0, 0},
256
       {8, 0},
257
       {0, 8},
258
       {8, 8},
259
       {0, 16},
260
       {8, 16},
261
       {0, 24},
262
       {8, 24}}}},
263
    {// BLOCK_32X16
264
     {// intra
265
      {// tx_depth 0
266
       {0, 0}},
267
      {// tx_depth 1
268
       {0, 0},
269
       {16, 0}},
270
      {// tx_depth 2
271
       {0, 0},
272
       {8, 0},
273
       {16, 0},
274
       {24, 0},
275
       {0, 8},
276
       {8, 8},
277
       {16, 8},
278
       {24, 8}}},
279
     {// inter
280
      {// tx_depth 0
281
       {0, 0}},
282
      {// tx_depth 1
283
       {0, 0},
284
       {16, 0}},
285
      {// tx_depth 2
286
       {0, 0},
287
       {8, 0},
288
       {0, 8},
289
       {8, 8},
290
       {16, 0},
291
       {24, 0},
292
       {16, 8},
293
       {24, 8}}}},
294
    {// BLOCK_32X32
295
     {// intra
296
      {// tx_depth 0
297
       {0, 0}},
298
      {// tx_depth 1
299
       {0, 0},
300
       {16, 0},
301
       {0, 16},
302
       {16, 16}},
303
      {// tx_depth 2
304
       {0, 0},
305
       {8, 0},
306
       {16, 0},
307
       {24, 0},
308
       {0, 8},
309
       {8, 8},
310
       {16, 8},
311
       {24, 8},
312
       {0, 16},
313
       {8, 16},
314
       {16, 16},
315
       {24, 16},
316
       {0, 24},
317
       {8, 24},
318
       {16, 24},
319
       {24, 24}}},
320
     {// inter
321
      {// tx_depth 0
322
       {0, 0}},
323
      {// tx_depth 1
324
       {0, 0},
325
       {16, 0},
326
       {0, 16},
327
       {16, 16}},
328
      {// tx_depth 2
329
       {0, 0},
330
       {8, 0},
331
       {0, 8},
332
       {8, 8},
333
       {16, 0},
334
       {24, 0},
335
       {16, 8},
336
       {24, 8},
337
       {0, 16},
338
       {8, 16},
339
       {0, 24},
340
       {8, 24},
341
       {16, 16},
342
       {24, 16},
343
       {16, 24},
344
       {24, 24}}}},
345
    {// BLOCK_32X64
346
     {// intra
347
      {// tx_depth 0
348
       {0, 0}},
349
      {// tx_depth 1
350
       {0, 0},
351
       {0, 32}},
352
      {// tx_depth 2
353
       {0, 0},
354
       {16, 0},
355
       {0, 16},
356
       {16, 16},
357
       {0, 32},
358
       {16, 32},
359
       {0, 48},
360
       {16, 48}}},
361
     {// inter
362
      {// tx_depth 0
363
       {0, 0}},
364
      {// tx_depth 1
365
       {0, 0},
366
       {0, 32}},
367
      {// tx_depth 2
368
       {0, 0},
369
       {16, 0},
370
       {0, 16},
371
       {16, 16},
372
       {0, 32},
373
       {16, 32},
374
       {0, 48},
375
       {16, 48}}}},
376
    {// BLOCK_64X32
377
     {// intra
378
      {// tx_depth 0
379
       {0, 0}},
380
      {// tx_depth 1
381
       {0, 0},
382
       {32, 0}},
383
      {// tx_depth 2
384
       {0, 0},
385
       {16, 0},
386
       {32, 0},
387
       {48, 0},
388
       {0, 16},
389
       {16, 16},
390
       {32, 16},
391
       {48, 16}}},
392
     {// inter
393
      {// tx_depth 0
394
       {0, 0}},
395
      {// tx_depth 1
396
       {0, 0},
397
       {32, 0}},
398
      {// tx_depth 2
399
       {0, 0},
400
       {16, 0},
401
       {0, 16},
402
       {16, 16},
403
       {32, 0},
404
       {48, 0},
405
       {32, 16},
406
       {48, 16}}}},
407
    {// BLOCK_64X64
408
     {// intra
409
      {// tx_depth 0
410
       {0, 0}},
411
      {// tx_depth 1
412
       {0, 0},
413
       {32, 0},
414
       {0, 32},
415
       {32, 32}},
416
      {// tx_depth 2
417
       {0, 0},
418
       {16, 0},
419
       {32, 0},
420
       {48, 0},
421
       {0, 16},
422
       {16, 16},
423
       {32, 16},
424
       {48, 16},
425
       {0, 32},
426
       {16, 32},
427
       {32, 32},
428
       {48, 32},
429
       {0, 48},
430
       {16, 48},
431
       {32, 48},
432
       {48, 48}}},
433
     {// inter
434
      {// tx_depth 0
435
       {0, 0}},
436
      {// tx_depth 1
437
       {0, 0},
438
       {32, 0},
439
       {0, 32},
440
       {32, 32}},
441
      {// tx_depth 2
442
       {0, 0},
443
       {16, 0},
444
       {0, 16},
445
       {16, 16},
446
       {32, 0},
447
       {48, 0},
448
       {32, 16},
449
       {48, 16},
450
       {0, 32},
451
       {16, 32},
452
       {0, 48},
453
       {16, 48},
454
       {32, 32},
455
       {48, 32},
456
       {32, 48},
457
       {48, 48}}}},
458
    {// BLOCK_64X128
459
     {// intra
460
      {// tx_depth 0
461
       {0, 0},
462
       {0, 64}},
463
      {// tx_depth 1
464
       {0, 0},
465
       {0, 64}},
466
      {// tx_depth 2
467
       {0, 0},
468
       {0, 64}}},
469
     {// inter
470
      {// tx_depth 0
471
       {0, 0},
472
       {0, 64}},
473
      {// tx_depth 1
474
       {0, 0},
475
       {0, 64}},
476
      {// tx_depth 2
477
       {0, 0},
478
       {0, 64}}}},
479
    {// BLOCK_128X64
480
     {// intra
481
      {// tx_depth 0
482
       {0, 0},
483
       {64, 0}},
484
      {// tx_depth 1
485
       {0, 0},
486
       {64, 0}},
487
      {// tx_depth 2
488
       {0, 0},
489
       {64, 0}}},
490
     {// inter
491
      {// tx_depth 0
492
       {0, 0},
493
       {64, 0}},
494
      {// tx_depth 1
495
       {0, 0},
496
       {64, 0}},
497
      {// tx_depth 2
498
       {0, 0},
499
       {64, 0}}}},
500
    {// BLOCK_128X128
501
     {// intra
502
      {// tx_depth 0
503
       {0, 0},
504
       {64, 0},
505
       {0, 64},
506
       {64, 64}},
507
      {// tx_depth 1
508
       {0, 0},
509
       {64, 0},
510
       {0, 64},
511
       {64, 64}},
512
      {// tx_depth 2
513
       {0, 0},
514
       {64, 0},
515
       {0, 64},
516
       {64, 64}}},
517
     {// inter
518
      {// tx_depth 0
519
       {0, 0},
520
       {64, 0},
521
       {0, 64},
522
       {64, 64}},
523
      {// tx_depth 1
524
       {0, 0},
525
       {64, 0},
526
       {0, 64},
527
       {64, 64}},
528
      {// tx_depth 2
529
       {0, 0},
530
       {64, 0},
531
       {0, 64},
532
       {64, 64}}}},
533
    {// BLOCK_4X16
534
     {// intra
535
      {// tx_depth 0
536
       {0, 0}},
537
      {// tx_depth 1
538
       {0, 0},
539
       {0, 8}},
540
      {// tx_depth 2
541
       {0, 0},
542
       {0, 4},
543
       {0, 8},
544
       {0, 12}}},
545
     {// inter
546
      {// tx_depth 0
547
       {0, 0}},
548
      {// tx_depth 1
549
       {0, 0},
550
       {0, 8}},
551
      {// tx_depth 2
552
       {0, 0},
553
       {0, 4},
554
       {0, 8},
555
       {0, 12}}}},
556
    {// BLOCK_16X4
557
     {// intra
558
      {// tx_depth 0
559
       {0, 0}},
560
      {// tx_depth 1
561
       {0, 0},
562
       {8, 0}},
563
      {// tx_depth 2
564
       {0, 0},
565
       {4, 0},
566
       {8, 0},
567
       {12, 0}}},
568
     {// inter
569
      {// tx_depth 0
570
       {0, 0}},
571
      {// tx_depth 1
572
       {0, 0},
573
       {8, 0}},
574
      {// tx_depth 2
575
       {0, 0},
576
       {4, 0},
577
       {8, 0},
578
       {12, 0}}}},
579
    {// BLOCK_8X32
580
     {// intra
581
      {// tx_depth 0
582
       {0, 0}},
583
      {// tx_depth 1
584
       {0, 0},
585
       {0, 16}},
586
      {// tx_depth 2
587
       {0, 0},
588
       {0, 8},
589
       {0, 16},
590
       {0, 24}}},
591
     {// inter
592
      {// tx_depth 0
593
       {0, 0}},
594
      {// tx_depth 1
595
       {0, 0},
596
       {0, 16}},
597
      {// tx_depth 2
598
       {0, 0},
599
       {0, 8},
600
       {0, 16},
601
       {0, 24}}}},
602
    {// BLOCK_32X8
603
     {// intra
604
      {// tx_depth 0
605
       {0, 0}},
606
      {// tx_depth 1
607
       {0, 0},
608
       {16, 0}},
609
      {// tx_depth 2
610
       {0, 0},
611
       {8, 0},
612
       {16, 0},
613
       {24, 0}}},
614
     {// inter
615
      {// tx_depth 0
616
       {0, 0}},
617
      {// tx_depth 1
618
       {0, 0},
619
       {16, 0}},
620
      {// tx_depth 2
621
       {0, 0},
622
       {8, 0},
623
       {16, 0},
624
       {24, 0}}}},
625
    {// BLOCK_16X64
626
     {// intra
627
      {// tx_depth 0
628
       {0, 0}},
629
      {// tx_depth 1
630
       {0, 0},
631
       {0, 32}},
632
      {// tx_depth 2
633
       {0, 0},
634
       {0, 16},
635
       {0, 32},
636
       {0, 48}}},
637
     {// inter
638
      {// tx_depth 0
639
       {0, 0}},
640
      {// tx_depth 1
641
       {0, 0},
642
       {0, 32}},
643
      {// tx_depth 2
644
       {0, 0},
645
       {0, 16},
646
       {0, 32},
647
       {0, 48}}}},
648
    {// BLOCK_64X16
649
     {// intra
650
      {// tx_depth 0
651
       {0, 0}},
652
      {// tx_depth 1
653
       {0, 0},
654
       {32, 0}},
655
      {// tx_depth 2
656
       {0, 0},
657
       {16, 0},
658
       {32, 0},
659
       {48, 0}}},
660
     {// inter
661
      {// tx_depth 0
662
       {0, 0}},
663
      {// tx_depth 1
664
       {0, 0},
665
       {32, 0}},
666
      {// tx_depth 2
667
       {0, 0},
668
       {16, 0},
669
       {32, 0},
670
       {48, 0}}}}};
671
672
// Per-stage range tables for the forward 1D transforms; going by the names these cover
// DCT 4/8/16/32/64, ADST 4/8/16/32 and identity 4/8/16/32/64.
// The DCT tables hold one entry per stage of the matching function in this file, e.g. 4 entries
// for stages 0-3 of svt_av1_fdct4_new and 8 entries for stages 0-7 of svt_av1_fdct16_new.
// NOTE(review): the "_mult2" suffix suggests the values are stored multiplied by 2 -- confirm.
static const int8_t fdct4_range_mult2[4]    = {0, 2, 3, 3};
673
static const int8_t fdct8_range_mult2[6]    = {0, 2, 4, 5, 5, 5};
674
static const int8_t fdct16_range_mult2[8]   = {0, 2, 4, 6, 7, 7, 7, 7};
675
static const int8_t fdct32_range_mult2[10]  = {0, 2, 4, 6, 8, 9, 9, 9, 9, 9};
676
static const int8_t fdct64_range_mult2[12]  = {0, 2, 4, 6, 8, 10, 11, 11, 11, 11, 11, 11};
677
static const int8_t fadst4_range_mult2[7]   = {0, 2, 4, 3, 3, 3, 3};
678
static const int8_t fadst8_range_mult2[8]   = {0, 0, 1, 3, 3, 5, 5, 5};
679
static const int8_t fadst16_range_mult2[10] = {0, 0, 1, 3, 3, 5, 5, 7, 7, 7};
680
static const int8_t fadst32_range_mult2[12] = {0, 0, 1, 3, 3, 5, 5, 7, 7, 9, 9, 9};
681
static const int8_t fidtx4_range_mult2[1]   = {1};
682
static const int8_t fidtx8_range_mult2[1]   = {2};
683
static const int8_t fidtx16_range_mult2[1]  = {3};
684
static const int8_t fidtx32_range_mult2[1]  = {4};
685
static const int8_t fidtx64_range_mult2[1]  = {5};
686
687
// Lookup from 1D transform type to its per-stage range table (TXFM_TYPES entries).
// Order used here: DCT 4..64, then ADST 4..32, then identity 4..64 -- presumably the order of the
// enum that defines TXFM_TYPES; confirm before reordering.
static const int8_t* fwd_txfm_range_mult2_list[TXFM_TYPES] = {fdct4_range_mult2,
688
                                                              fdct8_range_mult2,
689
                                                              fdct16_range_mult2,
690
                                                              fdct32_range_mult2,
691
                                                              fdct64_range_mult2,
692
                                                              fadst4_range_mult2,
693
                                                              fadst8_range_mult2,
694
                                                              fadst16_range_mult2,
695
                                                              fadst32_range_mult2,
696
                                                              fidtx4_range_mult2,
697
                                                              fidtx8_range_mult2,
698
                                                              fidtx16_range_mult2,
699
                                                              fidtx32_range_mult2,
700
                                                              fidtx64_range_mult2};
701
702
// Three shift values per transform size; the name suffix gives the size.
// NOTE(review): svt_av1_gen_fwd_stage_range in this file reads shift[0] and shift[1] through
// cfg->shift; presumably cfg->shift points at one of these arrays -- confirm against the code
// that fills in the config.
static const int8_t fwd_shift_4x4[3]   = {2, 0, 0};
703
static const int8_t fwd_shift_8x8[3]   = {2, -1, 0};
704
static const int8_t fwd_shift_16x16[3] = {2, -2, 0};
705
static const int8_t fwd_shift_32x32[3] = {2, -4, 0};
706
static const int8_t fwd_shift_64x64[3] = {0, -2, -2};
707
static const int8_t fwd_shift_4x8[3]   = {2, -1, 0};
708
static const int8_t fwd_shift_8x4[3]   = {2, -1, 0};
709
static const int8_t fwd_shift_8x16[3]  = {2, -2, 0};
710
static const int8_t fwd_shift_16x8[3]  = {2, -2, 0};
711
static const int8_t fwd_shift_16x32[3] = {2, -4, 0};
712
static const int8_t fwd_shift_32x16[3] = {2, -4, 0};
713
static const int8_t fwd_shift_32x64[3] = {0, -2, -2};
714
static const int8_t fwd_shift_64x32[3] = {2, -4, -2};
715
static const int8_t fwd_shift_4x16[3]  = {2, -1, 0};
716
static const int8_t fwd_shift_16x4[3]  = {2, -1, 0};
717
static const int8_t fwd_shift_8x32[3]  = {2, -2, 0};
718
static const int8_t fwd_shift_32x8[3]  = {2, -2, 0};
719
static const int8_t fwd_shift_16x64[3] = {0, -2, 0};
720
static const int8_t fwd_shift_64x16[3] = {2, -4, 0};
721
722
// Shift array for each transform size (TX_SIZES_ALL entries): the five square sizes first,
// followed by the rectangular ones. Presumably indexed by the transform-size enum, so the
// order must stay in step with it -- confirm before editing.
const int8_t* fwd_txfm_shift_ls[TX_SIZES_ALL] = {
723
    fwd_shift_4x4,  fwd_shift_8x8,  fwd_shift_16x16, fwd_shift_32x32, fwd_shift_64x64, fwd_shift_4x8,   fwd_shift_8x4,
724
    fwd_shift_8x16, fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64, fwd_shift_64x32, fwd_shift_4x16,
725
    fwd_shift_16x4, fwd_shift_8x32, fwd_shift_32x8,  fwd_shift_16x64, fwd_shift_64x16,
726
};
727
728
void svt_av1_gen_fwd_stage_range(int8_t* stage_range_col, int8_t* stage_range_row, const Txfm2dFlipCfg* cfg,
729
30.1k
                                 int32_t bd) {
730
    // Take the shift from the larger dimension in the rectangular case.
731
30.1k
    const int8_t* shift = cfg->shift;
732
    // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
733
262k
    for (int32_t i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
734
232k
        stage_range_col[i] = (int8_t)(cfg->stage_range_col[i] + shift[0] + bd + 1);
735
232k
    }
736
    // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
737
267k
    for (int32_t i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
738
237k
        stage_range_row[i] = (int8_t)(cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1);
739
237k
    }
740
30.1k
}
741
742
44.9k
// 4-point forward DCT (1D), computed as a fixed three-stage network.
//   input       : 4 values, read only
//   output      : 4 values; also holds the stage-1 intermediate results
//   cos_bit     : selects the cosine table (cospi_arr) and is forwarded to half_btf
//   stage_range : unused
void svt_av1_fdct4_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    // Index into the stage-2 results for each final output slot.
    static const uint8_t out_order[4] = {0, 2, 1, 3};
    int32_t              rot[4];

    (void)stage_range;

    // stage 1: sums of mirrored inputs first, then their differences
    output[0] = input[0] + input[3];
    output[1] = input[1] + input[2];
    output[2] = -input[2] + input[1];
    output[3] = -input[3] + input[0];

    // stage 2: weighted pairs, written to scratch so stage 3 can reorder them
    const int32_t* const cospi = cospi_arr(cos_bit);

    rot[0] = half_btf(cospi[32], output[0], cospi[32], output[1], cos_bit);
    rot[1] = half_btf(-cospi[32], output[1], cospi[32], output[0], cos_bit);
    rot[2] = half_btf(cospi[48], output[2], cospi[16], output[3], cos_bit);
    rot[3] = half_btf(cospi[48], output[3], -cospi[16], output[2], cos_bit);

    // stage 3: reorder into the final output positions
    for (int32_t k = 0; k < 4; ++k) {
        output[k] = rot[out_order[k]];
    }
}
775
776
135k
// 8-point forward DCT (1D), computed as a fixed five-stage network that ping-pongs between
// `output` and a local scratch array.
//   input       : 8 values, read only
//   output      : 8 values; also used for intermediate results of stages 1 and 3
//   cos_bit     : selects the cosine table (cospi_arr) and is forwarded to half_btf
//   stage_range : unused
// The cosine table is fetched once: cos_bit does not change between stages, so the lookups that
// used to be repeated at stages 3 and 4 were redundant (assumes cospi_arr is a plain lookup).
void svt_av1_fdct8_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    // Index into the stage-4 results for each final output slot.
    static const uint8_t out_order[8] = {0, 4, 2, 6, 1, 5, 3, 7};
    int32_t              step[8];

    (void)stage_range;

    // stage 1: all sums of mirrored inputs, then all differences (output written in ascending order)
    for (int32_t i = 0; i < 4; ++i) {
        output[i] = input[i] + input[7 - i];
    }
    for (int32_t i = 4; i < 8; ++i) {
        output[i] = -input[i] + input[7 - i];
    }

    const int32_t* const cospi = cospi_arr(cos_bit);

    // stage 2: output -> step
    step[0] = output[0] + output[3];
    step[1] = output[1] + output[2];
    step[2] = -output[2] + output[1];
    step[3] = -output[3] + output[0];
    step[4] = output[4];
    step[5] = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    step[6] = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    step[7] = output[7];

    // stage 3: step -> output
    output[0] = half_btf(cospi[32], step[0], cospi[32], step[1], cos_bit);
    output[1] = half_btf(-cospi[32], step[1], cospi[32], step[0], cos_bit);
    output[2] = half_btf(cospi[48], step[2], cospi[16], step[3], cos_bit);
    output[3] = half_btf(cospi[48], step[3], -cospi[16], step[2], cos_bit);
    output[4] = step[4] + step[5];
    output[5] = -step[5] + step[4];
    output[6] = -step[6] + step[7];
    output[7] = step[7] + step[6];

    // stage 4: output -> step; the lower half passes through unchanged
    for (int32_t i = 0; i < 4; ++i) {
        step[i] = output[i];
    }
    step[4] = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);
    step[5] = half_btf(cospi[24], output[5], cospi[40], output[6], cos_bit);
    step[6] = half_btf(cospi[24], output[6], -cospi[40], output[5], cos_bit);
    step[7] = half_btf(cospi[56], output[7], -cospi[8], output[4], cos_bit);

    // stage 5: reorder into the final output positions
    for (int32_t k = 0; k < 8; ++k) {
        output[k] = step[out_order[k]];
    }
}
847
848
125k
// 16-point forward DCT (1D), computed as a fixed seven-stage network that ping-pongs between
// `output` and a local scratch array.
//   input       : 16 values, read only
//   output      : 16 values; also used for intermediate results of stages 1, 3 and 5
//   cos_bit     : selects the cosine table (cospi_arr) and is forwarded to half_btf
//   stage_range : unused
// The cosine table is fetched once: cos_bit does not change between stages, so the lookups that
// used to be repeated at stages 3 to 6 were redundant (assumes cospi_arr is a plain lookup).
void svt_av1_fdct16_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    // Index into the stage-6 results for each final output slot.
    static const uint8_t out_order[16] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};
    int32_t              step[16];

    (void)stage_range;

    // stage 1: all sums of mirrored inputs, then all differences (output written in ascending order)
    for (int32_t i = 0; i < 8; ++i) {
        output[i] = input[i] + input[15 - i];
    }
    for (int32_t i = 8; i < 16; ++i) {
        output[i] = -input[i] + input[15 - i];
    }

    const int32_t* const cospi = cospi_arr(cos_bit);

    // stage 2: output -> step
    for (int32_t i = 0; i < 4; ++i) {
        step[i] = output[i] + output[7 - i];
    }
    for (int32_t i = 4; i < 8; ++i) {
        step[i] = -output[i] + output[7 - i];
    }
    step[8]  = output[8];
    step[9]  = output[9];
    step[10] = half_btf(-cospi[32], output[10], cospi[32], output[13], cos_bit);
    step[11] = half_btf(-cospi[32], output[11], cospi[32], output[12], cos_bit);
    step[12] = half_btf(cospi[32], output[12], cospi[32], output[11], cos_bit);
    step[13] = half_btf(cospi[32], output[13], cospi[32], output[10], cos_bit);
    step[14] = output[14];
    step[15] = output[15];

    // stage 3: step -> output
    output[0]  = step[0] + step[3];
    output[1]  = step[1] + step[2];
    output[2]  = -step[2] + step[1];
    output[3]  = -step[3] + step[0];
    output[4]  = step[4];
    output[5]  = half_btf(-cospi[32], step[5], cospi[32], step[6], cos_bit);
    output[6]  = half_btf(cospi[32], step[6], cospi[32], step[5], cos_bit);
    output[7]  = step[7];
    output[8]  = step[8] + step[11];
    output[9]  = step[9] + step[10];
    output[10] = -step[10] + step[9];
    output[11] = -step[11] + step[8];
    output[12] = -step[12] + step[15];
    output[13] = -step[13] + step[14];
    output[14] = step[14] + step[13];
    output[15] = step[15] + step[12];

    // stage 4: output -> step
    step[0]  = half_btf(cospi[32], output[0], cospi[32], output[1], cos_bit);
    step[1]  = half_btf(-cospi[32], output[1], cospi[32], output[0], cos_bit);
    step[2]  = half_btf(cospi[48], output[2], cospi[16], output[3], cos_bit);
    step[3]  = half_btf(cospi[48], output[3], -cospi[16], output[2], cos_bit);
    step[4]  = output[4] + output[5];
    step[5]  = -output[5] + output[4];
    step[6]  = -output[6] + output[7];
    step[7]  = output[7] + output[6];
    step[8]  = output[8];
    step[9]  = half_btf(-cospi[16], output[9], cospi[48], output[14], cos_bit);
    step[10] = half_btf(-cospi[48], output[10], -cospi[16], output[13], cos_bit);
    step[11] = output[11];
    step[12] = output[12];
    step[13] = half_btf(cospi[48], output[13], -cospi[16], output[10], cos_bit);
    step[14] = half_btf(cospi[16], output[14], cospi[48], output[9], cos_bit);
    step[15] = output[15];

    // stage 5: step -> output; entries 0..3 pass through unchanged
    for (int32_t i = 0; i < 4; ++i) {
        output[i] = step[i];
    }
    output[4]  = half_btf(cospi[56], step[4], cospi[8], step[7], cos_bit);
    output[5]  = half_btf(cospi[24], step[5], cospi[40], step[6], cos_bit);
    output[6]  = half_btf(cospi[24], step[6], -cospi[40], step[5], cos_bit);
    output[7]  = half_btf(cospi[56], step[7], -cospi[8], step[4], cos_bit);
    output[8]  = step[8] + step[9];
    output[9]  = -step[9] + step[8];
    output[10] = -step[10] + step[11];
    output[11] = step[11] + step[10];
    output[12] = step[12] + step[13];
    output[13] = -step[13] + step[12];
    output[14] = -step[14] + step[15];
    output[15] = step[15] + step[14];

    // stage 6: output -> step; entries 0..7 pass through unchanged
    for (int32_t i = 0; i < 8; ++i) {
        step[i] = output[i];
    }
    step[8]  = half_btf(cospi[60], output[8], cospi[4], output[15], cos_bit);
    step[9]  = half_btf(cospi[28], output[9], cospi[36], output[14], cos_bit);
    step[10] = half_btf(cospi[44], output[10], cospi[20], output[13], cos_bit);
    step[11] = half_btf(cospi[12], output[11], cospi[52], output[12], cos_bit);
    step[12] = half_btf(cospi[12], output[12], -cospi[52], output[11], cos_bit);
    step[13] = half_btf(cospi[44], output[13], -cospi[20], output[10], cos_bit);
    step[14] = half_btf(cospi[28], output[14], -cospi[36], output[9], cos_bit);
    step[15] = half_btf(cospi[60], output[15], -cospi[4], output[8], cos_bit);

    // stage 7: reorder into the final output positions
    for (int32_t k = 0; k < 16; ++k) {
        output[k] = step[out_order[k]];
    }
}
1001
1002
573k
/*
 * 32-point forward transform (a DCT, going by the function name) computed as
 * a nine-stage butterfly network.
 *
 * input       : 32 coefficients, read only during stage 1.
 * output      : 32 results; also used as one of the two ping-pong work buffers.
 * cos_bit     : selects the cosine table via cospi_arr() and is forwarded to
 *               every half_btf() rotation.
 * stage_range : unused.
 *
 * Stages alternate between `output` and the local `step` array, so within a
 * stage every read comes from one buffer and every write goes to the other.
 * NOTE(review): stage 1 writes `output` while still reading `input`, so the
 * two buffers presumably must not overlap - confirm against callers.
 */
void svt_av1_fdct32_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    // cospi index applied to an element's own value in the stage 7 / stage 8
    // rotations; the partner value is weighted by cospi[64 - index].
    static const uint8_t stage7_weight[4] = {60, 28, 44, 12};
    static const uint8_t stage8_weight[8] = {62, 30, 46, 14, 54, 22, 38, 6};
    // Stage 9 reordering: output[i] = step[stage9_from[i]] (5-bit bit reversal).
    static const uint8_t stage9_from[32] = {0, 16, 8,  24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
                                            1, 17, 9,  25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31};

    (void)stage_range;

    // The same cos_bit is used by every stage, so the table is fetched once
    // (assumes cospi_arr() is a plain lookup keyed on cos_bit).
    const int32_t* const cospi = cospi_arr(cos_bit);
    int32_t              step[32];
    int                  i, base;

    // stage 1: input -> output, mirror sums then mirror differences.
    // Written in ascending index order, exactly as the unrolled form did.
    for (i = 0; i < 16; i++) {
        output[i] = input[i] + input[31 - i];
    }
    for (i = 16; i < 32; i++) {
        output[i] = -input[i] + input[31 - i];
    }

    // stage 2: output -> step
    for (i = 0; i < 8; i++) {
        step[i]      = output[i] + output[15 - i];
        step[15 - i] = -output[15 - i] + output[i];
    }
    for (i = 16; i < 20; i++) {
        step[i]      = output[i];
        step[i + 12] = output[i + 12];
    }
    for (i = 20; i < 24; i++) {
        step[i]      = half_btf(-cospi[32], output[i], cospi[32], output[47 - i], cos_bit);
        step[47 - i] = half_btf(cospi[32], output[47 - i], cospi[32], output[i], cos_bit);
    }

    // stage 3: step -> output
    for (i = 0; i < 4; i++) {
        output[i]      = step[i] + step[7 - i];
        output[7 - i]  = -step[7 - i] + step[i];
        output[16 + i] = step[16 + i] + step[23 - i];
        output[23 - i] = -step[23 - i] + step[16 + i];
        output[24 + i] = -step[24 + i] + step[31 - i];
        output[31 - i] = step[31 - i] + step[24 + i];
    }
    output[8]  = step[8];
    output[9]  = step[9];
    output[10] = half_btf(-cospi[32], step[10], cospi[32], step[13], cos_bit);
    output[11] = half_btf(-cospi[32], step[11], cospi[32], step[12], cos_bit);
    output[12] = half_btf(cospi[32], step[12], cospi[32], step[11], cos_bit);
    output[13] = half_btf(cospi[32], step[13], cospi[32], step[10], cos_bit);
    output[14] = step[14];
    output[15] = step[15];

    // stage 4: output -> step
    step[0] = output[0] + output[3];
    step[1] = output[1] + output[2];
    step[2] = -output[2] + output[1];
    step[3] = -output[3] + output[0];
    step[4] = output[4];
    step[5] = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    step[6] = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    step[7] = output[7];
    for (i = 0; i < 2; i++) {
        step[8 + i]  = output[8 + i] + output[11 - i];
        step[11 - i] = -output[11 - i] + output[8 + i];
        step[12 + i] = -output[12 + i] + output[15 - i];
        step[15 - i] = output[15 - i] + output[12 + i];
    }
    step[16] = output[16];
    step[17] = output[17];
    step[18] = half_btf(-cospi[16], output[18], cospi[48], output[29], cos_bit);
    step[19] = half_btf(-cospi[16], output[19], cospi[48], output[28], cos_bit);
    step[20] = half_btf(-cospi[48], output[20], -cospi[16], output[27], cos_bit);
    step[21] = half_btf(-cospi[48], output[21], -cospi[16], output[26], cos_bit);
    for (i = 22; i < 26; i++) {
        step[i] = output[i];
    }
    step[26] = half_btf(cospi[48], output[26], -cospi[16], output[21], cos_bit);
    step[27] = half_btf(cospi[48], output[27], -cospi[16], output[20], cos_bit);
    step[28] = half_btf(cospi[16], output[28], cospi[48], output[19], cos_bit);
    step[29] = half_btf(cospi[16], output[29], cospi[48], output[18], cos_bit);
    step[30] = output[30];
    step[31] = output[31];

    // stage 5: step -> output
    output[0]  = half_btf(cospi[32], step[0], cospi[32], step[1], cos_bit);
    output[1]  = half_btf(-cospi[32], step[1], cospi[32], step[0], cos_bit);
    output[2]  = half_btf(cospi[48], step[2], cospi[16], step[3], cos_bit);
    output[3]  = half_btf(cospi[48], step[3], -cospi[16], step[2], cos_bit);
    output[4]  = step[4] + step[5];
    output[5]  = -step[5] + step[4];
    output[6]  = -step[6] + step[7];
    output[7]  = step[7] + step[6];
    output[8]  = step[8];
    output[9]  = half_btf(-cospi[16], step[9], cospi[48], step[14], cos_bit);
    output[10] = half_btf(-cospi[48], step[10], -cospi[16], step[13], cos_bit);
    output[11] = step[11];
    output[12] = step[12];
    output[13] = half_btf(cospi[48], step[13], -cospi[16], step[10], cos_bit);
    output[14] = half_btf(cospi[16], step[14], cospi[48], step[9], cos_bit);
    output[15] = step[15];
    // Two groups of eight (16..23 and 24..31) share one add/subtract layout.
    for (base = 16; base < 32; base += 8) {
        for (i = 0; i < 2; i++) {
            output[base + i]     = step[base + i] + step[base + 3 - i];
            output[base + 3 - i] = -step[base + 3 - i] + step[base + i];
            output[base + 4 + i] = -step[base + 4 + i] + step[base + 7 - i];
            output[base + 7 - i] = step[base + 7 - i] + step[base + 4 + i];
        }
    }

    // stage 6: output -> step
    for (i = 0; i < 4; i++) {
        step[i] = output[i];
    }
    step[4] = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);
    step[5] = half_btf(cospi[24], output[5], cospi[40], output[6], cos_bit);
    step[6] = half_btf(cospi[24], output[6], -cospi[40], output[5], cos_bit);
    step[7] = half_btf(cospi[56], output[7], -cospi[8], output[4], cos_bit);
    for (base = 8; base < 16; base += 4) {
        step[base]     = output[base] + output[base + 1];
        step[base + 1] = -output[base + 1] + output[base];
        step[base + 2] = -output[base + 2] + output[base + 3];
        step[base + 3] = output[base + 3] + output[base + 2];
    }
    // In each group of four from 16 up, the outer two entries pass through.
    for (base = 16; base < 32; base += 4) {
        step[base]     = output[base];
        step[base + 3] = output[base + 3];
    }
    step[17] = half_btf(-cospi[8], output[17], cospi[56], output[30], cos_bit);
    step[18] = half_btf(-cospi[56], output[18], -cospi[8], output[29], cos_bit);
    step[21] = half_btf(-cospi[40], output[21], cospi[24], output[26], cos_bit);
    step[22] = half_btf(-cospi[24], output[22], -cospi[40], output[25], cos_bit);
    step[25] = half_btf(cospi[24], output[25], -cospi[40], output[22], cos_bit);
    step[26] = half_btf(cospi[40], output[26], cospi[24], output[21], cos_bit);
    step[29] = half_btf(cospi[56], output[29], -cospi[8], output[18], cos_bit);
    step[30] = half_btf(cospi[8], output[30], cospi[56], output[17], cos_bit);

    // stage 7: step -> output
    for (i = 0; i < 8; i++) {
        output[i] = step[i];
    }
    for (i = 0; i < 4; i++) {
        const int lo = 8 + i;
        const int hi = 15 - i;
        const int w  = stage7_weight[i];
        output[lo]   = half_btf(cospi[w], step[lo], cospi[64 - w], step[hi], cos_bit);
        output[hi]   = half_btf(cospi[w], step[hi], -cospi[64 - w], step[lo], cos_bit);
    }
    for (base = 16; base < 32; base += 4) {
        output[base]     = step[base] + step[base + 1];
        output[base + 1] = -step[base + 1] + step[base];
        output[base + 2] = -step[base + 2] + step[base + 3];
        output[base + 3] = step[base + 3] + step[base + 2];
    }

    // stage 8: output -> step
    for (i = 0; i < 16; i++) {
        step[i] = output[i];
    }
    for (i = 0; i < 8; i++) {
        const int lo = 16 + i;
        const int hi = 31 - i;
        const int w  = stage8_weight[i];
        step[lo]     = half_btf(cospi[w], output[lo], cospi[64 - w], output[hi], cos_bit);
        step[hi]     = half_btf(cospi[w], output[hi], -cospi[64 - w], output[lo], cos_bit);
    }

    // stage 9: step -> output, final index permutation.
    for (i = 0; i < 32; i++) {
        output[i] = step[stage9_from[i]];
    }
}
1341
1342
485k
void svt_av1_fdct64_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
1343
485k
    (void)stage_range;
1344
485k
    const int32_t* cospi;
1345
1346
485k
    int32_t *bf0, *bf1;
1347
485k
    int32_t  step[64];
1348
1349
    // stage 0;
1350
1351
    // stage 1;
1352
485k
    bf1     = output;
1353
485k
    bf1[0]  = input[0] + input[63];
1354
485k
    bf1[1]  = input[1] + input[62];
1355
485k
    bf1[2]  = input[2] + input[61];
1356
485k
    bf1[3]  = input[3] + input[60];
1357
485k
    bf1[4]  = input[4] + input[59];
1358
485k
    bf1[5]  = input[5] + input[58];
1359
485k
    bf1[6]  = input[6] + input[57];
1360
485k
    bf1[7]  = input[7] + input[56];
1361
485k
    bf1[8]  = input[8] + input[55];
1362
485k
    bf1[9]  = input[9] + input[54];
1363
485k
    bf1[10] = input[10] + input[53];
1364
485k
    bf1[11] = input[11] + input[52];
1365
485k
    bf1[12] = input[12] + input[51];
1366
485k
    bf1[13] = input[13] + input[50];
1367
485k
    bf1[14] = input[14] + input[49];
1368
485k
    bf1[15] = input[15] + input[48];
1369
485k
    bf1[16] = input[16] + input[47];
1370
485k
    bf1[17] = input[17] + input[46];
1371
485k
    bf1[18] = input[18] + input[45];
1372
485k
    bf1[19] = input[19] + input[44];
1373
485k
    bf1[20] = input[20] + input[43];
1374
485k
    bf1[21] = input[21] + input[42];
1375
485k
    bf1[22] = input[22] + input[41];
1376
485k
    bf1[23] = input[23] + input[40];
1377
485k
    bf1[24] = input[24] + input[39];
1378
485k
    bf1[25] = input[25] + input[38];
1379
485k
    bf1[26] = input[26] + input[37];
1380
485k
    bf1[27] = input[27] + input[36];
1381
485k
    bf1[28] = input[28] + input[35];
1382
485k
    bf1[29] = input[29] + input[34];
1383
485k
    bf1[30] = input[30] + input[33];
1384
485k
    bf1[31] = input[31] + input[32];
1385
485k
    bf1[32] = -input[32] + input[31];
1386
485k
    bf1[33] = -input[33] + input[30];
1387
485k
    bf1[34] = -input[34] + input[29];
1388
485k
    bf1[35] = -input[35] + input[28];
1389
485k
    bf1[36] = -input[36] + input[27];
1390
485k
    bf1[37] = -input[37] + input[26];
1391
485k
    bf1[38] = -input[38] + input[25];
1392
485k
    bf1[39] = -input[39] + input[24];
1393
485k
    bf1[40] = -input[40] + input[23];
1394
485k
    bf1[41] = -input[41] + input[22];
1395
485k
    bf1[42] = -input[42] + input[21];
1396
485k
    bf1[43] = -input[43] + input[20];
1397
485k
    bf1[44] = -input[44] + input[19];
1398
485k
    bf1[45] = -input[45] + input[18];
1399
485k
    bf1[46] = -input[46] + input[17];
1400
485k
    bf1[47] = -input[47] + input[16];
1401
485k
    bf1[48] = -input[48] + input[15];
1402
485k
    bf1[49] = -input[49] + input[14];
1403
485k
    bf1[50] = -input[50] + input[13];
1404
485k
    bf1[51] = -input[51] + input[12];
1405
485k
    bf1[52] = -input[52] + input[11];
1406
485k
    bf1[53] = -input[53] + input[10];
1407
485k
    bf1[54] = -input[54] + input[9];
1408
485k
    bf1[55] = -input[55] + input[8];
1409
485k
    bf1[56] = -input[56] + input[7];
1410
485k
    bf1[57] = -input[57] + input[6];
1411
485k
    bf1[58] = -input[58] + input[5];
1412
485k
    bf1[59] = -input[59] + input[4];
1413
485k
    bf1[60] = -input[60] + input[3];
1414
485k
    bf1[61] = -input[61] + input[2];
1415
485k
    bf1[62] = -input[62] + input[1];
1416
485k
    bf1[63] = -input[63] + input[0];
1417
1418
    // stage 2
1419
485k
    cospi   = cospi_arr(cos_bit);
1420
485k
    bf0     = output;
1421
485k
    bf1     = step;
1422
485k
    bf1[0]  = bf0[0] + bf0[31];
1423
485k
    bf1[1]  = bf0[1] + bf0[30];
1424
485k
    bf1[2]  = bf0[2] + bf0[29];
1425
485k
    bf1[3]  = bf0[3] + bf0[28];
1426
485k
    bf1[4]  = bf0[4] + bf0[27];
1427
485k
    bf1[5]  = bf0[5] + bf0[26];
1428
485k
    bf1[6]  = bf0[6] + bf0[25];
1429
485k
    bf1[7]  = bf0[7] + bf0[24];
1430
485k
    bf1[8]  = bf0[8] + bf0[23];
1431
485k
    bf1[9]  = bf0[9] + bf0[22];
1432
485k
    bf1[10] = bf0[10] + bf0[21];
1433
485k
    bf1[11] = bf0[11] + bf0[20];
1434
485k
    bf1[12] = bf0[12] + bf0[19];
1435
485k
    bf1[13] = bf0[13] + bf0[18];
1436
485k
    bf1[14] = bf0[14] + bf0[17];
1437
485k
    bf1[15] = bf0[15] + bf0[16];
1438
485k
    bf1[16] = -bf0[16] + bf0[15];
1439
485k
    bf1[17] = -bf0[17] + bf0[14];
1440
485k
    bf1[18] = -bf0[18] + bf0[13];
1441
485k
    bf1[19] = -bf0[19] + bf0[12];
1442
485k
    bf1[20] = -bf0[20] + bf0[11];
1443
485k
    bf1[21] = -bf0[21] + bf0[10];
1444
485k
    bf1[22] = -bf0[22] + bf0[9];
1445
485k
    bf1[23] = -bf0[23] + bf0[8];
1446
485k
    bf1[24] = -bf0[24] + bf0[7];
1447
485k
    bf1[25] = -bf0[25] + bf0[6];
1448
485k
    bf1[26] = -bf0[26] + bf0[5];
1449
485k
    bf1[27] = -bf0[27] + bf0[4];
1450
485k
    bf1[28] = -bf0[28] + bf0[3];
1451
485k
    bf1[29] = -bf0[29] + bf0[2];
1452
485k
    bf1[30] = -bf0[30] + bf0[1];
1453
485k
    bf1[31] = -bf0[31] + bf0[0];
1454
485k
    bf1[32] = bf0[32];
1455
485k
    bf1[33] = bf0[33];
1456
485k
    bf1[34] = bf0[34];
1457
485k
    bf1[35] = bf0[35];
1458
485k
    bf1[36] = bf0[36];
1459
485k
    bf1[37] = bf0[37];
1460
485k
    bf1[38] = bf0[38];
1461
485k
    bf1[39] = bf0[39];
1462
485k
    bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1463
485k
    bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1464
485k
    bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1465
485k
    bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1466
485k
    bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1467
485k
    bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1468
485k
    bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1469
485k
    bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1470
485k
    bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
1471
485k
    bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
1472
485k
    bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
1473
485k
    bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
1474
485k
    bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
1475
485k
    bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
1476
485k
    bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
1477
485k
    bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
1478
485k
    bf1[56] = bf0[56];
1479
485k
    bf1[57] = bf0[57];
1480
485k
    bf1[58] = bf0[58];
1481
485k
    bf1[59] = bf0[59];
1482
485k
    bf1[60] = bf0[60];
1483
485k
    bf1[61] = bf0[61];
1484
485k
    bf1[62] = bf0[62];
1485
485k
    bf1[63] = bf0[63];
1486
1487
    // stage 3
1488
485k
    cospi   = cospi_arr(cos_bit);
1489
485k
    bf0     = step;
1490
485k
    bf1     = output;
1491
485k
    bf1[0]  = bf0[0] + bf0[15];
1492
485k
    bf1[1]  = bf0[1] + bf0[14];
1493
485k
    bf1[2]  = bf0[2] + bf0[13];
1494
485k
    bf1[3]  = bf0[3] + bf0[12];
1495
485k
    bf1[4]  = bf0[4] + bf0[11];
1496
485k
    bf1[5]  = bf0[5] + bf0[10];
1497
485k
    bf1[6]  = bf0[6] + bf0[9];
1498
485k
    bf1[7]  = bf0[7] + bf0[8];
1499
485k
    bf1[8]  = -bf0[8] + bf0[7];
1500
485k
    bf1[9]  = -bf0[9] + bf0[6];
1501
485k
    bf1[10] = -bf0[10] + bf0[5];
1502
485k
    bf1[11] = -bf0[11] + bf0[4];
1503
485k
    bf1[12] = -bf0[12] + bf0[3];
1504
485k
    bf1[13] = -bf0[13] + bf0[2];
1505
485k
    bf1[14] = -bf0[14] + bf0[1];
1506
485k
    bf1[15] = -bf0[15] + bf0[0];
1507
485k
    bf1[16] = bf0[16];
1508
485k
    bf1[17] = bf0[17];
1509
485k
    bf1[18] = bf0[18];
1510
485k
    bf1[19] = bf0[19];
1511
485k
    bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1512
485k
    bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1513
485k
    bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1514
485k
    bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1515
485k
    bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
1516
485k
    bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
1517
485k
    bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
1518
485k
    bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
1519
485k
    bf1[28] = bf0[28];
1520
485k
    bf1[29] = bf0[29];
1521
485k
    bf1[30] = bf0[30];
1522
485k
    bf1[31] = bf0[31];
1523
485k
    bf1[32] = bf0[32] + bf0[47];
1524
485k
    bf1[33] = bf0[33] + bf0[46];
1525
485k
    bf1[34] = bf0[34] + bf0[45];
1526
485k
    bf1[35] = bf0[35] + bf0[44];
1527
485k
    bf1[36] = bf0[36] + bf0[43];
1528
485k
    bf1[37] = bf0[37] + bf0[42];
1529
485k
    bf1[38] = bf0[38] + bf0[41];
1530
485k
    bf1[39] = bf0[39] + bf0[40];
1531
485k
    bf1[40] = -bf0[40] + bf0[39];
1532
485k
    bf1[41] = -bf0[41] + bf0[38];
1533
485k
    bf1[42] = -bf0[42] + bf0[37];
1534
485k
    bf1[43] = -bf0[43] + bf0[36];
1535
485k
    bf1[44] = -bf0[44] + bf0[35];
1536
485k
    bf1[45] = -bf0[45] + bf0[34];
1537
485k
    bf1[46] = -bf0[46] + bf0[33];
1538
485k
    bf1[47] = -bf0[47] + bf0[32];
1539
485k
    bf1[48] = -bf0[48] + bf0[63];
1540
485k
    bf1[49] = -bf0[49] + bf0[62];
1541
485k
    bf1[50] = -bf0[50] + bf0[61];
1542
485k
    bf1[51] = -bf0[51] + bf0[60];
1543
485k
    bf1[52] = -bf0[52] + bf0[59];
1544
485k
    bf1[53] = -bf0[53] + bf0[58];
1545
485k
    bf1[54] = -bf0[54] + bf0[57];
1546
485k
    bf1[55] = -bf0[55] + bf0[56];
1547
485k
    bf1[56] = bf0[56] + bf0[55];
1548
485k
    bf1[57] = bf0[57] + bf0[54];
1549
485k
    bf1[58] = bf0[58] + bf0[53];
1550
485k
    bf1[59] = bf0[59] + bf0[52];
1551
485k
    bf1[60] = bf0[60] + bf0[51];
1552
485k
    bf1[61] = bf0[61] + bf0[50];
1553
485k
    bf1[62] = bf0[62] + bf0[49];
1554
485k
    bf1[63] = bf0[63] + bf0[48];
1555
1556
    // stage 4
1557
485k
    cospi   = cospi_arr(cos_bit);
1558
485k
    bf0     = output;
1559
485k
    bf1     = step;
1560
485k
    bf1[0]  = bf0[0] + bf0[7];
1561
485k
    bf1[1]  = bf0[1] + bf0[6];
1562
485k
    bf1[2]  = bf0[2] + bf0[5];
1563
485k
    bf1[3]  = bf0[3] + bf0[4];
1564
485k
    bf1[4]  = -bf0[4] + bf0[3];
1565
485k
    bf1[5]  = -bf0[5] + bf0[2];
1566
485k
    bf1[6]  = -bf0[6] + bf0[1];
1567
485k
    bf1[7]  = -bf0[7] + bf0[0];
1568
485k
    bf1[8]  = bf0[8];
1569
485k
    bf1[9]  = bf0[9];
1570
485k
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1571
485k
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1572
485k
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
1573
485k
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
1574
485k
    bf1[14] = bf0[14];
1575
485k
    bf1[15] = bf0[15];
1576
485k
    bf1[16] = bf0[16] + bf0[23];
1577
485k
    bf1[17] = bf0[17] + bf0[22];
1578
485k
    bf1[18] = bf0[18] + bf0[21];
1579
485k
    bf1[19] = bf0[19] + bf0[20];
1580
485k
    bf1[20] = -bf0[20] + bf0[19];
1581
485k
    bf1[21] = -bf0[21] + bf0[18];
1582
485k
    bf1[22] = -bf0[22] + bf0[17];
1583
485k
    bf1[23] = -bf0[23] + bf0[16];
1584
485k
    bf1[24] = -bf0[24] + bf0[31];
1585
485k
    bf1[25] = -bf0[25] + bf0[30];
1586
485k
    bf1[26] = -bf0[26] + bf0[29];
1587
485k
    bf1[27] = -bf0[27] + bf0[28];
1588
485k
    bf1[28] = bf0[28] + bf0[27];
1589
485k
    bf1[29] = bf0[29] + bf0[26];
1590
485k
    bf1[30] = bf0[30] + bf0[25];
1591
485k
    bf1[31] = bf0[31] + bf0[24];
1592
485k
    bf1[32] = bf0[32];
1593
485k
    bf1[33] = bf0[33];
1594
485k
    bf1[34] = bf0[34];
1595
485k
    bf1[35] = bf0[35];
1596
485k
    bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
1597
485k
    bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
1598
485k
    bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
1599
485k
    bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
1600
485k
    bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
1601
485k
    bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
1602
485k
    bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
1603
485k
    bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
1604
485k
    bf1[44] = bf0[44];
1605
485k
    bf1[45] = bf0[45];
1606
485k
    bf1[46] = bf0[46];
1607
485k
    bf1[47] = bf0[47];
1608
485k
    bf1[48] = bf0[48];
1609
485k
    bf1[49] = bf0[49];
1610
485k
    bf1[50] = bf0[50];
1611
485k
    bf1[51] = bf0[51];
1612
485k
    bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
1613
485k
    bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
1614
485k
    bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
1615
485k
    bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
1616
485k
    bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
1617
485k
    bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
1618
485k
    bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
1619
485k
    bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
1620
485k
    bf1[60] = bf0[60];
1621
485k
    bf1[61] = bf0[61];
1622
485k
    bf1[62] = bf0[62];
1623
485k
    bf1[63] = bf0[63];
1624
1625
    // stage 5
1626
485k
    cospi   = cospi_arr(cos_bit);
1627
485k
    bf0     = step;
1628
485k
    bf1     = output;
1629
485k
    bf1[0]  = bf0[0] + bf0[3];
1630
485k
    bf1[1]  = bf0[1] + bf0[2];
1631
485k
    bf1[2]  = -bf0[2] + bf0[1];
1632
485k
    bf1[3]  = -bf0[3] + bf0[0];
1633
485k
    bf1[4]  = bf0[4];
1634
485k
    bf1[5]  = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1635
485k
    bf1[6]  = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
1636
485k
    bf1[7]  = bf0[7];
1637
485k
    bf1[8]  = bf0[8] + bf0[11];
1638
485k
    bf1[9]  = bf0[9] + bf0[10];
1639
485k
    bf1[10] = -bf0[10] + bf0[9];
1640
485k
    bf1[11] = -bf0[11] + bf0[8];
1641
485k
    bf1[12] = -bf0[12] + bf0[15];
1642
485k
    bf1[13] = -bf0[13] + bf0[14];
1643
485k
    bf1[14] = bf0[14] + bf0[13];
1644
485k
    bf1[15] = bf0[15] + bf0[12];
1645
485k
    bf1[16] = bf0[16];
1646
485k
    bf1[17] = bf0[17];
1647
485k
    bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
1648
485k
    bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
1649
485k
    bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
1650
485k
    bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
1651
485k
    bf1[22] = bf0[22];
1652
485k
    bf1[23] = bf0[23];
1653
485k
    bf1[24] = bf0[24];
1654
485k
    bf1[25] = bf0[25];
1655
485k
    bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
1656
485k
    bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
1657
485k
    bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
1658
485k
    bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
1659
485k
    bf1[30] = bf0[30];
1660
485k
    bf1[31] = bf0[31];
1661
485k
    bf1[32] = bf0[32] + bf0[39];
1662
485k
    bf1[33] = bf0[33] + bf0[38];
1663
485k
    bf1[34] = bf0[34] + bf0[37];
1664
485k
    bf1[35] = bf0[35] + bf0[36];
1665
485k
    bf1[36] = -bf0[36] + bf0[35];
1666
485k
    bf1[37] = -bf0[37] + bf0[34];
1667
485k
    bf1[38] = -bf0[38] + bf0[33];
1668
485k
    bf1[39] = -bf0[39] + bf0[32];
1669
485k
    bf1[40] = -bf0[40] + bf0[47];
1670
485k
    bf1[41] = -bf0[41] + bf0[46];
1671
485k
    bf1[42] = -bf0[42] + bf0[45];
1672
485k
    bf1[43] = -bf0[43] + bf0[44];
1673
485k
    bf1[44] = bf0[44] + bf0[43];
1674
485k
    bf1[45] = bf0[45] + bf0[42];
1675
485k
    bf1[46] = bf0[46] + bf0[41];
1676
485k
    bf1[47] = bf0[47] + bf0[40];
1677
485k
    bf1[48] = bf0[48] + bf0[55];
1678
485k
    bf1[49] = bf0[49] + bf0[54];
1679
485k
    bf1[50] = bf0[50] + bf0[53];
1680
485k
    bf1[51] = bf0[51] + bf0[52];
1681
485k
    bf1[52] = -bf0[52] + bf0[51];
1682
485k
    bf1[53] = -bf0[53] + bf0[50];
1683
485k
    bf1[54] = -bf0[54] + bf0[49];
1684
485k
    bf1[55] = -bf0[55] + bf0[48];
1685
485k
    bf1[56] = -bf0[56] + bf0[63];
1686
485k
    bf1[57] = -bf0[57] + bf0[62];
1687
485k
    bf1[58] = -bf0[58] + bf0[61];
1688
485k
    bf1[59] = -bf0[59] + bf0[60];
1689
485k
    bf1[60] = bf0[60] + bf0[59];
1690
485k
    bf1[61] = bf0[61] + bf0[58];
1691
485k
    bf1[62] = bf0[62] + bf0[57];
1692
485k
    bf1[63] = bf0[63] + bf0[56];
1693
1694
    // stage 6
1695
485k
    cospi   = cospi_arr(cos_bit);
1696
485k
    bf0     = output;
1697
485k
    bf1     = step;
1698
485k
    bf1[0]  = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
1699
485k
    bf1[1]  = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
1700
485k
    bf1[2]  = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
1701
485k
    bf1[3]  = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
1702
485k
    bf1[4]  = bf0[4] + bf0[5];
1703
485k
    bf1[5]  = -bf0[5] + bf0[4];
1704
485k
    bf1[6]  = -bf0[6] + bf0[7];
1705
485k
    bf1[7]  = bf0[7] + bf0[6];
1706
485k
    bf1[8]  = bf0[8];
1707
485k
    bf1[9]  = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
1708
485k
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
1709
485k
    bf1[11] = bf0[11];
1710
485k
    bf1[12] = bf0[12];
1711
485k
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
1712
485k
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
1713
485k
    bf1[15] = bf0[15];
1714
485k
    bf1[16] = bf0[16] + bf0[19];
1715
485k
    bf1[17] = bf0[17] + bf0[18];
1716
485k
    bf1[18] = -bf0[18] + bf0[17];
1717
485k
    bf1[19] = -bf0[19] + bf0[16];
1718
485k
    bf1[20] = -bf0[20] + bf0[23];
1719
485k
    bf1[21] = -bf0[21] + bf0[22];
1720
485k
    bf1[22] = bf0[22] + bf0[21];
1721
485k
    bf1[23] = bf0[23] + bf0[20];
1722
485k
    bf1[24] = bf0[24] + bf0[27];
1723
485k
    bf1[25] = bf0[25] + bf0[26];
1724
485k
    bf1[26] = -bf0[26] + bf0[25];
1725
485k
    bf1[27] = -bf0[27] + bf0[24];
1726
485k
    bf1[28] = -bf0[28] + bf0[31];
1727
485k
    bf1[29] = -bf0[29] + bf0[30];
1728
485k
    bf1[30] = bf0[30] + bf0[29];
1729
485k
    bf1[31] = bf0[31] + bf0[28];
1730
485k
    bf1[32] = bf0[32];
1731
485k
    bf1[33] = bf0[33];
1732
485k
    bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
1733
485k
    bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
1734
485k
    bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
1735
485k
    bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
1736
485k
    bf1[38] = bf0[38];
1737
485k
    bf1[39] = bf0[39];
1738
485k
    bf1[40] = bf0[40];
1739
485k
    bf1[41] = bf0[41];
1740
485k
    bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
1741
485k
    bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
1742
485k
    bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
1743
485k
    bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
1744
485k
    bf1[46] = bf0[46];
1745
485k
    bf1[47] = bf0[47];
1746
485k
    bf1[48] = bf0[48];
1747
485k
    bf1[49] = bf0[49];
1748
485k
    bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
1749
485k
    bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
1750
485k
    bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
1751
485k
    bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
1752
485k
    bf1[54] = bf0[54];
1753
485k
    bf1[55] = bf0[55];
1754
485k
    bf1[56] = bf0[56];
1755
485k
    bf1[57] = bf0[57];
1756
485k
    bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
1757
485k
    bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
1758
485k
    bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
1759
485k
    bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
1760
485k
    bf1[62] = bf0[62];
1761
485k
    bf1[63] = bf0[63];
1762
1763
    // stage 7
1764
485k
    cospi   = cospi_arr(cos_bit);
1765
485k
    bf0     = step;
1766
485k
    bf1     = output;
1767
485k
    bf1[0]  = bf0[0];
1768
485k
    bf1[1]  = bf0[1];
1769
485k
    bf1[2]  = bf0[2];
1770
485k
    bf1[3]  = bf0[3];
1771
485k
    bf1[4]  = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
1772
485k
    bf1[5]  = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
1773
485k
    bf1[6]  = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
1774
485k
    bf1[7]  = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
1775
485k
    bf1[8]  = bf0[8] + bf0[9];
1776
485k
    bf1[9]  = -bf0[9] + bf0[8];
1777
485k
    bf1[10] = -bf0[10] + bf0[11];
1778
485k
    bf1[11] = bf0[11] + bf0[10];
1779
485k
    bf1[12] = bf0[12] + bf0[13];
1780
485k
    bf1[13] = -bf0[13] + bf0[12];
1781
485k
    bf1[14] = -bf0[14] + bf0[15];
1782
485k
    bf1[15] = bf0[15] + bf0[14];
1783
485k
    bf1[16] = bf0[16];
1784
485k
    bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
1785
485k
    bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
1786
485k
    bf1[19] = bf0[19];
1787
485k
    bf1[20] = bf0[20];
1788
485k
    bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
1789
485k
    bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
1790
485k
    bf1[23] = bf0[23];
1791
485k
    bf1[24] = bf0[24];
1792
485k
    bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
1793
485k
    bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
1794
485k
    bf1[27] = bf0[27];
1795
485k
    bf1[28] = bf0[28];
1796
485k
    bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
1797
485k
    bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
1798
485k
    bf1[31] = bf0[31];
1799
485k
    bf1[32] = bf0[32] + bf0[35];
1800
485k
    bf1[33] = bf0[33] + bf0[34];
1801
485k
    bf1[34] = -bf0[34] + bf0[33];
1802
485k
    bf1[35] = -bf0[35] + bf0[32];
1803
485k
    bf1[36] = -bf0[36] + bf0[39];
1804
485k
    bf1[37] = -bf0[37] + bf0[38];
1805
485k
    bf1[38] = bf0[38] + bf0[37];
1806
485k
    bf1[39] = bf0[39] + bf0[36];
1807
485k
    bf1[40] = bf0[40] + bf0[43];
1808
485k
    bf1[41] = bf0[41] + bf0[42];
1809
485k
    bf1[42] = -bf0[42] + bf0[41];
1810
485k
    bf1[43] = -bf0[43] + bf0[40];
1811
485k
    bf1[44] = -bf0[44] + bf0[47];
1812
485k
    bf1[45] = -bf0[45] + bf0[46];
1813
485k
    bf1[46] = bf0[46] + bf0[45];
1814
485k
    bf1[47] = bf0[47] + bf0[44];
1815
485k
    bf1[48] = bf0[48] + bf0[51];
1816
485k
    bf1[49] = bf0[49] + bf0[50];
1817
485k
    bf1[50] = -bf0[50] + bf0[49];
1818
485k
    bf1[51] = -bf0[51] + bf0[48];
1819
485k
    bf1[52] = -bf0[52] + bf0[55];
1820
485k
    bf1[53] = -bf0[53] + bf0[54];
1821
485k
    bf1[54] = bf0[54] + bf0[53];
1822
485k
    bf1[55] = bf0[55] + bf0[52];
1823
485k
    bf1[56] = bf0[56] + bf0[59];
1824
485k
    bf1[57] = bf0[57] + bf0[58];
1825
485k
    bf1[58] = -bf0[58] + bf0[57];
1826
485k
    bf1[59] = -bf0[59] + bf0[56];
1827
485k
    bf1[60] = -bf0[60] + bf0[63];
1828
485k
    bf1[61] = -bf0[61] + bf0[62];
1829
485k
    bf1[62] = bf0[62] + bf0[61];
1830
485k
    bf1[63] = bf0[63] + bf0[60];
1831
1832
    // stage 8
1833
485k
    cospi   = cospi_arr(cos_bit);
1834
485k
    bf0     = output;
1835
485k
    bf1     = step;
1836
485k
    bf1[0]  = bf0[0];
1837
485k
    bf1[1]  = bf0[1];
1838
485k
    bf1[2]  = bf0[2];
1839
485k
    bf1[3]  = bf0[3];
1840
485k
    bf1[4]  = bf0[4];
1841
485k
    bf1[5]  = bf0[5];
1842
485k
    bf1[6]  = bf0[6];
1843
485k
    bf1[7]  = bf0[7];
1844
485k
    bf1[8]  = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
1845
485k
    bf1[9]  = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
1846
485k
    bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
1847
485k
    bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
1848
485k
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
1849
485k
    bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
1850
485k
    bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
1851
485k
    bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
1852
485k
    bf1[16] = bf0[16] + bf0[17];
1853
485k
    bf1[17] = -bf0[17] + bf0[16];
1854
485k
    bf1[18] = -bf0[18] + bf0[19];
1855
485k
    bf1[19] = bf0[19] + bf0[18];
1856
485k
    bf1[20] = bf0[20] + bf0[21];
1857
485k
    bf1[21] = -bf0[21] + bf0[20];
1858
485k
    bf1[22] = -bf0[22] + bf0[23];
1859
485k
    bf1[23] = bf0[23] + bf0[22];
1860
485k
    bf1[24] = bf0[24] + bf0[25];
1861
485k
    bf1[25] = -bf0[25] + bf0[24];
1862
485k
    bf1[26] = -bf0[26] + bf0[27];
1863
485k
    bf1[27] = bf0[27] + bf0[26];
1864
485k
    bf1[28] = bf0[28] + bf0[29];
1865
485k
    bf1[29] = -bf0[29] + bf0[28];
1866
485k
    bf1[30] = -bf0[30] + bf0[31];
1867
485k
    bf1[31] = bf0[31] + bf0[30];
1868
485k
    bf1[32] = bf0[32];
1869
485k
    bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
1870
485k
    bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
1871
485k
    bf1[35] = bf0[35];
1872
485k
    bf1[36] = bf0[36];
1873
485k
    bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
1874
485k
    bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
1875
485k
    bf1[39] = bf0[39];
1876
485k
    bf1[40] = bf0[40];
1877
485k
    bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
1878
485k
    bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
1879
485k
    bf1[43] = bf0[43];
1880
485k
    bf1[44] = bf0[44];
1881
485k
    bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
1882
485k
    bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
1883
485k
    bf1[47] = bf0[47];
1884
485k
    bf1[48] = bf0[48];
1885
485k
    bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
1886
485k
    bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
1887
485k
    bf1[51] = bf0[51];
1888
485k
    bf1[52] = bf0[52];
1889
485k
    bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
1890
485k
    bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
1891
485k
    bf1[55] = bf0[55];
1892
485k
    bf1[56] = bf0[56];
1893
485k
    bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
1894
485k
    bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
1895
485k
    bf1[59] = bf0[59];
1896
485k
    bf1[60] = bf0[60];
1897
485k
    bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
1898
485k
    bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
1899
485k
    bf1[63] = bf0[63];
1900
1901
    // stage 9
1902
485k
    cospi   = cospi_arr(cos_bit);
1903
485k
    bf0     = step;
1904
485k
    bf1     = output;
1905
485k
    bf1[0]  = bf0[0];
1906
485k
    bf1[1]  = bf0[1];
1907
485k
    bf1[2]  = bf0[2];
1908
485k
    bf1[3]  = bf0[3];
1909
485k
    bf1[4]  = bf0[4];
1910
485k
    bf1[5]  = bf0[5];
1911
485k
    bf1[6]  = bf0[6];
1912
485k
    bf1[7]  = bf0[7];
1913
485k
    bf1[8]  = bf0[8];
1914
485k
    bf1[9]  = bf0[9];
1915
485k
    bf1[10] = bf0[10];
1916
485k
    bf1[11] = bf0[11];
1917
485k
    bf1[12] = bf0[12];
1918
485k
    bf1[13] = bf0[13];
1919
485k
    bf1[14] = bf0[14];
1920
485k
    bf1[15] = bf0[15];
1921
485k
    bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
1922
485k
    bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
1923
485k
    bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
1924
485k
    bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
1925
485k
    bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
1926
485k
    bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
1927
485k
    bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
1928
485k
    bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
1929
485k
    bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
1930
485k
    bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
1931
485k
    bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
1932
485k
    bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
1933
485k
    bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
1934
485k
    bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
1935
485k
    bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
1936
485k
    bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
1937
485k
    bf1[32] = bf0[32] + bf0[33];
1938
485k
    bf1[33] = -bf0[33] + bf0[32];
1939
485k
    bf1[34] = -bf0[34] + bf0[35];
1940
485k
    bf1[35] = bf0[35] + bf0[34];
1941
485k
    bf1[36] = bf0[36] + bf0[37];
1942
485k
    bf1[37] = -bf0[37] + bf0[36];
1943
485k
    bf1[38] = -bf0[38] + bf0[39];
1944
485k
    bf1[39] = bf0[39] + bf0[38];
1945
485k
    bf1[40] = bf0[40] + bf0[41];
1946
485k
    bf1[41] = -bf0[41] + bf0[40];
1947
485k
    bf1[42] = -bf0[42] + bf0[43];
1948
485k
    bf1[43] = bf0[43] + bf0[42];
1949
485k
    bf1[44] = bf0[44] + bf0[45];
1950
485k
    bf1[45] = -bf0[45] + bf0[44];
1951
485k
    bf1[46] = -bf0[46] + bf0[47];
1952
485k
    bf1[47] = bf0[47] + bf0[46];
1953
485k
    bf1[48] = bf0[48] + bf0[49];
1954
485k
    bf1[49] = -bf0[49] + bf0[48];
1955
485k
    bf1[50] = -bf0[50] + bf0[51];
1956
485k
    bf1[51] = bf0[51] + bf0[50];
1957
485k
    bf1[52] = bf0[52] + bf0[53];
1958
485k
    bf1[53] = -bf0[53] + bf0[52];
1959
485k
    bf1[54] = -bf0[54] + bf0[55];
1960
485k
    bf1[55] = bf0[55] + bf0[54];
1961
485k
    bf1[56] = bf0[56] + bf0[57];
1962
485k
    bf1[57] = -bf0[57] + bf0[56];
1963
485k
    bf1[58] = -bf0[58] + bf0[59];
1964
485k
    bf1[59] = bf0[59] + bf0[58];
1965
485k
    bf1[60] = bf0[60] + bf0[61];
1966
485k
    bf1[61] = -bf0[61] + bf0[60];
1967
485k
    bf1[62] = -bf0[62] + bf0[63];
1968
485k
    bf1[63] = bf0[63] + bf0[62];
1969
1970
    // stage 10
1971
485k
    cospi   = cospi_arr(cos_bit);
1972
485k
    bf0     = output;
1973
485k
    bf1     = step;
1974
485k
    bf1[0]  = bf0[0];
1975
485k
    bf1[1]  = bf0[1];
1976
485k
    bf1[2]  = bf0[2];
1977
485k
    bf1[3]  = bf0[3];
1978
485k
    bf1[4]  = bf0[4];
1979
485k
    bf1[5]  = bf0[5];
1980
485k
    bf1[6]  = bf0[6];
1981
485k
    bf1[7]  = bf0[7];
1982
485k
    bf1[8]  = bf0[8];
1983
485k
    bf1[9]  = bf0[9];
1984
485k
    bf1[10] = bf0[10];
1985
485k
    bf1[11] = bf0[11];
1986
485k
    bf1[12] = bf0[12];
1987
485k
    bf1[13] = bf0[13];
1988
485k
    bf1[14] = bf0[14];
1989
485k
    bf1[15] = bf0[15];
1990
485k
    bf1[16] = bf0[16];
1991
485k
    bf1[17] = bf0[17];
1992
485k
    bf1[18] = bf0[18];
1993
485k
    bf1[19] = bf0[19];
1994
485k
    bf1[20] = bf0[20];
1995
485k
    bf1[21] = bf0[21];
1996
485k
    bf1[22] = bf0[22];
1997
485k
    bf1[23] = bf0[23];
1998
485k
    bf1[24] = bf0[24];
1999
485k
    bf1[25] = bf0[25];
2000
485k
    bf1[26] = bf0[26];
2001
485k
    bf1[27] = bf0[27];
2002
485k
    bf1[28] = bf0[28];
2003
485k
    bf1[29] = bf0[29];
2004
485k
    bf1[30] = bf0[30];
2005
485k
    bf1[31] = bf0[31];
2006
485k
    bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
2007
485k
    bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
2008
485k
    bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
2009
485k
    bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
2010
485k
    bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
2011
485k
    bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
2012
485k
    bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
2013
485k
    bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
2014
485k
    bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
2015
485k
    bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
2016
485k
    bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
2017
485k
    bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
2018
485k
    bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
2019
485k
    bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
2020
485k
    bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
2021
485k
    bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
2022
485k
    bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
2023
485k
    bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
2024
485k
    bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
2025
485k
    bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
2026
485k
    bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
2027
485k
    bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
2028
485k
    bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
2029
485k
    bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
2030
485k
    bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
2031
485k
    bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
2032
485k
    bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
2033
485k
    bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
2034
485k
    bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
2035
485k
    bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
2036
485k
    bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
2037
485k
    bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
2038
2039
    // stage 11
2040
485k
    bf0     = step;
2041
485k
    bf1     = output;
2042
485k
    bf1[0]  = bf0[0];
2043
485k
    bf1[1]  = bf0[32];
2044
485k
    bf1[2]  = bf0[16];
2045
485k
    bf1[3]  = bf0[48];
2046
485k
    bf1[4]  = bf0[8];
2047
485k
    bf1[5]  = bf0[40];
2048
485k
    bf1[6]  = bf0[24];
2049
485k
    bf1[7]  = bf0[56];
2050
485k
    bf1[8]  = bf0[4];
2051
485k
    bf1[9]  = bf0[36];
2052
485k
    bf1[10] = bf0[20];
2053
485k
    bf1[11] = bf0[52];
2054
485k
    bf1[12] = bf0[12];
2055
485k
    bf1[13] = bf0[44];
2056
485k
    bf1[14] = bf0[28];
2057
485k
    bf1[15] = bf0[60];
2058
485k
    bf1[16] = bf0[2];
2059
485k
    bf1[17] = bf0[34];
2060
485k
    bf1[18] = bf0[18];
2061
485k
    bf1[19] = bf0[50];
2062
485k
    bf1[20] = bf0[10];
2063
485k
    bf1[21] = bf0[42];
2064
485k
    bf1[22] = bf0[26];
2065
485k
    bf1[23] = bf0[58];
2066
485k
    bf1[24] = bf0[6];
2067
485k
    bf1[25] = bf0[38];
2068
485k
    bf1[26] = bf0[22];
2069
485k
    bf1[27] = bf0[54];
2070
485k
    bf1[28] = bf0[14];
2071
485k
    bf1[29] = bf0[46];
2072
485k
    bf1[30] = bf0[30];
2073
485k
    bf1[31] = bf0[62];
2074
485k
    bf1[32] = bf0[1];
2075
485k
    bf1[33] = bf0[33];
2076
485k
    bf1[34] = bf0[17];
2077
485k
    bf1[35] = bf0[49];
2078
485k
    bf1[36] = bf0[9];
2079
485k
    bf1[37] = bf0[41];
2080
485k
    bf1[38] = bf0[25];
2081
485k
    bf1[39] = bf0[57];
2082
485k
    bf1[40] = bf0[5];
2083
485k
    bf1[41] = bf0[37];
2084
485k
    bf1[42] = bf0[21];
2085
485k
    bf1[43] = bf0[53];
2086
485k
    bf1[44] = bf0[13];
2087
485k
    bf1[45] = bf0[45];
2088
485k
    bf1[46] = bf0[29];
2089
485k
    bf1[47] = bf0[61];
2090
485k
    bf1[48] = bf0[3];
2091
485k
    bf1[49] = bf0[35];
2092
485k
    bf1[50] = bf0[19];
2093
485k
    bf1[51] = bf0[51];
2094
485k
    bf1[52] = bf0[11];
2095
485k
    bf1[53] = bf0[43];
2096
485k
    bf1[54] = bf0[27];
2097
485k
    bf1[55] = bf0[59];
2098
485k
    bf1[56] = bf0[7];
2099
485k
    bf1[57] = bf0[39];
2100
485k
    bf1[58] = bf0[23];
2101
485k
    bf1[59] = bf0[55];
2102
485k
    bf1[60] = bf0[15];
2103
485k
    bf1[61] = bf0[47];
2104
485k
    bf1[62] = bf0[31];
2105
485k
    bf1[63] = bf0[63];
2106
485k
}
2107
2108
0
/*
 * 4-point 1-D transform (the name indicates the forward ADST).
 *
 *   input       - four coefficients to transform (read only).
 *   output      - receives the four transformed coefficients.
 *   cos_bit     - selects the sinpi table and is also the final rounding shift.
 *   stage_range - unused.
 *
 * All intermediate sums are kept in int32_t and combined in a fixed
 * association order; no per-stage range checking is performed.
 */
void svt_av1_fadst4_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    const int32_t        shift   = cos_bit;
    const int32_t* const sin_tab = sinpi_arr(shift);

    const int32_t in0 = input[0];
    const int32_t in1 = input[1];
    const int32_t in2 = input[2];
    const int32_t in3 = input[3];

    // An all-zero input yields an all-zero output; skip the arithmetic.
    if ((in0 | in1 | in2 | in3) == 0) {
        for (int i = 3; i >= 0; --i) { output[i] = 0; }
        return;
    }

    // Individual sinpi-weighted products of the inputs.
    const int32_t p1_in0 = sin_tab[1] * in0;
    const int32_t p4_in0 = sin_tab[4] * in0;
    const int32_t p2_in1 = sin_tab[2] * in1;
    const int32_t p1_in1 = sin_tab[1] * in1;
    const int32_t p3_in2 = sin_tab[3] * in2;
    const int32_t p4_in3 = sin_tab[4] * in3;
    const int32_t p2_in3 = sin_tab[2] * in3;

    // (in0 + in1 - in3) is scaled by sinpi[3] as a whole.
    const int32_t mix = (in0 + in1) - in3;

    // Partial accumulators shared by several outputs.
    const int32_t acc_a = (p1_in0 + p2_in1) + p4_in3;
    const int32_t acc_b = (p4_in0 - p1_in1) + p2_in3;

    const int32_t r0 = acc_a + p3_in2;
    const int32_t r1 = sin_tab[3] * mix;
    const int32_t r2 = acc_b - p3_in2;
    const int32_t r3 = (acc_b - acc_a) + p3_in2;

    // 1-D transform scaling factor is sqrt(2).
    output[0] = round_shift(r0, shift);
    output[1] = round_shift(r1, shift);
    output[2] = round_shift(r2, shift);
    output[3] = round_shift(r3, shift);
}
2196
2197
0
/*
 * 8-point 1-D transform (the name indicates the forward ADST), computed as a
 * seven-stage network that alternates between `output` and a local scratch
 * array.
 *
 *   input       - eight coefficients to transform (read only); must not be
 *                 the same pointer as output.
 *   output      - receives the eight transformed coefficients; also used as
 *                 working storage for the odd-numbered stages.
 *   cos_bit     - selects the cospi table and is forwarded to half_btf.
 *   stage_range - unused.
 */
void svt_av1_fadst8_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    // Stage 7 reordering: output[i] takes the stage-6 value at out_order[i].
    static const int8_t out_order[8] = {1, 6, 3, 4, 5, 2, 7, 0};

    int32_t tmp[8];
    int     i, grp;

    (void)stage_range;
    assert(output != input);

    // Stage 1: signed permutation of the input. Kept as explicit statements
    // in the original order since only pointer inequality is asserted.
    output[0] = input[0];
    output[1] = -input[7];
    output[2] = -input[3];
    output[3] = input[4];
    output[4] = -input[1];
    output[5] = input[6];
    output[6] = input[2];
    output[7] = -input[5];

    const int32_t* const cospi = cospi_arr(cos_bit);
    const int32_t        c16   = cospi[16];
    const int32_t        c32   = cospi[32];
    const int32_t        c48   = cospi[48];

    // Stage 2: in every group of four, the first pair passes through and the
    // second pair is combined with cospi[32] weights.
    for (grp = 0; grp < 8; grp += 4) {
        tmp[grp]     = output[grp];
        tmp[grp + 1] = output[grp + 1];
        tmp[grp + 2] = half_btf(c32, output[grp + 2], c32, output[grp + 3], cos_bit);
        tmp[grp + 3] = half_btf(c32, output[grp + 2], -c32, output[grp + 3], cos_bit);
    }

    // Stage 3: sum/difference of elements two apart within groups of four.
    for (grp = 0; grp < 8; grp += 4) {
        for (i = grp; i < grp + 2; ++i) {
            output[i]     = tmp[i] + tmp[i + 2];
            output[i + 2] = tmp[i] - tmp[i + 2];
        }
    }

    // Stage 4: lower half passes through, upper half uses cospi[16]/cospi[48].
    for (i = 0; i < 4; ++i) { tmp[i] = output[i]; }
    tmp[4] = half_btf(c16, output[4], c48, output[5], cos_bit);
    tmp[5] = half_btf(c48, output[4], -c16, output[5], cos_bit);
    tmp[6] = half_btf(-c48, output[6], c16, output[7], cos_bit);
    tmp[7] = half_btf(c16, output[6], c48, output[7], cos_bit);

    // Stage 5: sum/difference of elements four apart.
    for (i = 0; i < 4; ++i) {
        output[i]     = tmp[i] + tmp[i + 4];
        output[i + 4] = tmp[i] - tmp[i + 4];
    }

    // Stage 6: pair j uses cospi[4 + 16*j] and its complement cospi[60 - 16*j],
    // i.e. (4,60), (20,44), (36,28), (52,12).
    for (i = 0; i < 4; ++i) {
        const int     lo = 4 + 16 * i;
        const int     hi = 64 - lo;
        const int32_t a  = output[2 * i];
        const int32_t b  = output[2 * i + 1];

        tmp[2 * i]     = half_btf(cospi[lo], a, cospi[hi], b, cos_bit);
        tmp[2 * i + 1] = half_btf(cospi[hi], a, -cospi[lo], b, cos_bit);
    }

    // Stage 7: final reordering into the output.
    for (i = 0; i < 8; ++i) { output[i] = tmp[out_order[i]]; }
}
2293
2294
0
/*
 * 16-point 1-D transform (the name indicates the forward ADST), computed as a
 * nine-stage network that alternates between `output` and a local scratch
 * array.
 *
 *   input       - sixteen coefficients to transform (read only); must not be
 *                 the same pointer as output.
 *   output      - receives the sixteen transformed coefficients; also used as
 *                 working storage for the odd-numbered stages.
 *   cos_bit     - selects the cospi table and is forwarded to half_btf.
 *   stage_range - unused.
 */
void svt_av1_fadst16_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    // Stage 9 reordering: output[i] takes the stage-8 value at out_order[i].
    static const int8_t out_order[16] = {1, 14, 3, 12, 5, 10, 7, 8, 9, 6, 11, 4, 13, 2, 15, 0};

    int32_t tmp[16];
    int     i, grp;

    (void)stage_range;
    assert(output != input);

    // Stage 1: signed permutation of the input. Kept as explicit statements
    // in the original order since only pointer inequality is asserted.
    output[0]  = input[0];
    output[1]  = -input[15];
    output[2]  = -input[7];
    output[3]  = input[8];
    output[4]  = -input[3];
    output[5]  = input[12];
    output[6]  = input[4];
    output[7]  = -input[11];
    output[8]  = -input[1];
    output[9]  = input[14];
    output[10] = input[6];
    output[11] = -input[9];
    output[12] = input[2];
    output[13] = -input[13];
    output[14] = -input[5];
    output[15] = input[10];

    const int32_t* const cospi = cospi_arr(cos_bit);
    const int32_t        c8    = cospi[8];
    const int32_t        c16   = cospi[16];
    const int32_t        c24   = cospi[24];
    const int32_t        c32   = cospi[32];
    const int32_t        c40   = cospi[40];
    const int32_t        c48   = cospi[48];
    const int32_t        c56   = cospi[56];

    // Stage 2: in every group of four, the first pair passes through and the
    // second pair is combined with cospi[32] weights.
    for (grp = 0; grp < 16; grp += 4) {
        tmp[grp]     = output[grp];
        tmp[grp + 1] = output[grp + 1];
        tmp[grp + 2] = half_btf(c32, output[grp + 2], c32, output[grp + 3], cos_bit);
        tmp[grp + 3] = half_btf(c32, output[grp + 2], -c32, output[grp + 3], cos_bit);
    }

    // Stage 3: sum/difference of elements two apart within groups of four.
    for (grp = 0; grp < 16; grp += 4) {
        for (i = grp; i < grp + 2; ++i) {
            output[i]     = tmp[i] + tmp[i + 2];
            output[i + 2] = tmp[i] - tmp[i + 2];
        }
    }

    // Stage 4: in every group of eight, the first four pass through and the
    // last four use cospi[16]/cospi[48] weights.
    for (grp = 0; grp < 16; grp += 8) {
        for (i = grp; i < grp + 4; ++i) { tmp[i] = output[i]; }
        tmp[grp + 4] = half_btf(c16, output[grp + 4], c48, output[grp + 5], cos_bit);
        tmp[grp + 5] = half_btf(c48, output[grp + 4], -c16, output[grp + 5], cos_bit);
        tmp[grp + 6] = half_btf(-c48, output[grp + 6], c16, output[grp + 7], cos_bit);
        tmp[grp + 7] = half_btf(c16, output[grp + 6], c48, output[grp + 7], cos_bit);
    }

    // Stage 5: sum/difference of elements four apart within groups of eight.
    for (grp = 0; grp < 16; grp += 8) {
        for (i = grp; i < grp + 4; ++i) {
            output[i]     = tmp[i] + tmp[i + 4];
            output[i + 4] = tmp[i] - tmp[i + 4];
        }
    }

    // Stage 6: lower half passes through; upper half uses the
    // cospi[8]/cospi[56] and cospi[40]/cospi[24] weight pairs.
    for (i = 0; i < 8; ++i) { tmp[i] = output[i]; }
    tmp[8]  = half_btf(c8, output[8], c56, output[9], cos_bit);
    tmp[9]  = half_btf(c56, output[8], -c8, output[9], cos_bit);
    tmp[10] = half_btf(c40, output[10], c24, output[11], cos_bit);
    tmp[11] = half_btf(c24, output[10], -c40, output[11], cos_bit);
    tmp[12] = half_btf(-c56, output[12], c8, output[13], cos_bit);
    tmp[13] = half_btf(c8, output[12], c56, output[13], cos_bit);
    tmp[14] = half_btf(-c24, output[14], c40, output[15], cos_bit);
    tmp[15] = half_btf(c40, output[14], c24, output[15], cos_bit);

    // Stage 7: sum/difference of elements eight apart.
    for (i = 0; i < 8; ++i) {
        output[i]     = tmp[i] + tmp[i + 8];
        output[i + 8] = tmp[i] - tmp[i + 8];
    }

    // Stage 8: pair j uses cospi[2 + 8*j] and its complement cospi[62 - 8*j],
    // i.e. (2,62), (10,54), (18,46), (26,38), (34,30), (42,22), (50,14), (58,6).
    for (i = 0; i < 8; ++i) {
        const int     lo = 2 + 8 * i;
        const int     hi = 64 - lo;
        const int32_t a  = output[2 * i];
        const int32_t b  = output[2 * i + 1];

        tmp[2 * i]     = half_btf(cospi[lo], a, cospi[hi], b, cos_bit);
        tmp[2 * i + 1] = half_btf(cospi[hi], a, -cospi[lo], b, cos_bit);
    }

    // Stage 9: final reordering into the output.
    for (i = 0; i < 16; ++i) { output[i] = tmp[out_order[i]]; }
}
2487
2488
0
// 32-point butterfly network; this is the routine handed out for TXFM_TYPE_ADST32.
//   input       : 32 source samples
//   output      : 32 results; also used as the ping-pong partner of the local `step` buffer
//   cos_bit     : passed to cospi_arr() and to every half_btf() call
//   stage_range : unused
// Odd stages write `output`, even stages write `step`. Stages 3/5/7/9 are add/subtract
// butterflies over spans of 16/8/4/2; stages 2/4/6/8/10 apply half_btf() rotations.
static void av1_fadst32_new(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;

    // Stage 11 source index for each output position (odd positions are negated).
    static const uint8_t last_stage_src[32] = {0, 16, 24, 8, 12, 28, 20, 4, 6, 22, 30, 14, 10, 26, 18, 2,
                                               3, 19, 27, 11, 15, 31, 23, 7, 5, 21, 29, 13, 9, 25, 17, 1};

    const int32_t* cospi;
    int32_t        step[32];

    // stage 0: nothing to do

    // stage 1: interleave the reversed odd samples with the even samples
    // (writes are issued in ascending output order, exactly as the unrolled form did)
    for (int32_t p = 0; p < 16; ++p) {
        output[2 * p]     = input[31 - 2 * p];
        output[2 * p + 1] = input[2 * p];
    }

    // stage 2: rotate every adjacent pair with cospi[1 + 4p] / cospi[63 - 4p]
    cospi = cospi_arr(cos_bit);
    for (int32_t p = 0; p < 16; ++p) {
        const int32_t lo = 2 * p;
        const int32_t hi = lo + 1;
        const int32_t ca = cospi[4 * p + 1];
        const int32_t cb = cospi[63 - 4 * p];
        step[lo]         = half_btf(ca, output[lo], cb, output[hi], cos_bit);
        step[hi]         = half_btf(-ca, output[hi], cb, output[lo], cos_bit);
    }

    // stage 3: add/sub butterflies, span 16
    for (int32_t i = 0; i < 16; ++i) {
        output[i]      = step[i] + step[i + 16];
        output[i + 16] = -step[i + 16] + step[i];
    }

    // stage 4: lower 16 pass through, upper 16 are rotated
    cospi = cospi_arr(cos_bit);
    for (int32_t i = 0; i < 16; ++i) { step[i] = output[i]; }
    for (int32_t k = 0; k < 4; ++k) {
        const int32_t ca = cospi[4 + 16 * k];
        const int32_t cb = cospi[60 - 16 * k];
        const int32_t p  = 16 + 2 * k;
        const int32_t q  = 24 + 2 * k;
        step[p]          = half_btf(ca, output[p], cb, output[p + 1], cos_bit);
        step[p + 1]      = half_btf(-ca, output[p + 1], cb, output[p], cos_bit);
        step[q]          = half_btf(-cb, output[q], ca, output[q + 1], cos_bit);
        step[q + 1]      = half_btf(cb, output[q + 1], ca, output[q], cos_bit);
    }

    // stage 5: add/sub butterflies, span 8, inside each group of 16
    for (int32_t g = 0; g < 32; g += 16) {
        for (int32_t i = 0; i < 8; ++i) {
            output[g + i]     = step[g + i] + step[g + i + 8];
            output[g + i + 8] = -step[g + i + 8] + step[g + i];
        }
    }

    // stage 6: per group of 16, lower 8 pass through, upper 8 are rotated
    cospi = cospi_arr(cos_bit);
    for (int32_t g = 0; g < 32; g += 16) {
        for (int32_t i = 0; i < 8; ++i) { step[g + i] = output[g + i]; }
        for (int32_t k = 0; k < 2; ++k) {
            const int32_t ca = cospi[8 + 32 * k];
            const int32_t cb = cospi[56 - 32 * k];
            const int32_t p  = g + 8 + 2 * k;
            const int32_t q  = g + 12 + 2 * k;
            step[p]          = half_btf(ca, output[p], cb, output[p + 1], cos_bit);
            step[p + 1]      = half_btf(-ca, output[p + 1], cb, output[p], cos_bit);
            step[q]          = half_btf(-cb, output[q], ca, output[q + 1], cos_bit);
            step[q + 1]      = half_btf(cb, output[q + 1], ca, output[q], cos_bit);
        }
    }

    // stage 7: add/sub butterflies, span 4, inside each group of 8
    for (int32_t g = 0; g < 32; g += 8) {
        for (int32_t i = 0; i < 4; ++i) {
            output[g + i]     = step[g + i] + step[g + i + 4];
            output[g + i + 4] = -step[g + i + 4] + step[g + i];
        }
    }

    // stage 8: per group of 8, lower 4 pass through, upper 4 are rotated
    cospi = cospi_arr(cos_bit);
    for (int32_t g = 0; g < 32; g += 8) {
        const int32_t ca = cospi[16];
        const int32_t cb = cospi[48];
        const int32_t p  = g + 4;
        const int32_t q  = g + 6;
        for (int32_t i = 0; i < 4; ++i) { step[g + i] = output[g + i]; }
        step[p]     = half_btf(ca, output[p], cb, output[p + 1], cos_bit);
        step[p + 1] = half_btf(-ca, output[p + 1], cb, output[p], cos_bit);
        step[q]     = half_btf(-cb, output[q], ca, output[q + 1], cos_bit);
        step[q + 1] = half_btf(cb, output[q + 1], ca, output[q], cos_bit);
    }

    // stage 9: add/sub butterflies, span 2, inside each group of 4
    for (int32_t g = 0; g < 32; g += 4) {
        for (int32_t i = 0; i < 2; ++i) {
            output[g + i]     = step[g + i] + step[g + i + 2];
            output[g + i + 2] = -step[g + i + 2] + step[g + i];
        }
    }

    // stage 10: per group of 4, lower 2 pass through, upper 2 are rotated by cospi[32]
    cospi = cospi_arr(cos_bit);
    for (int32_t g = 0; g < 32; g += 4) {
        const int32_t c32 = cospi[32];
        const int32_t p   = g + 2;
        step[g]           = output[g];
        step[g + 1]       = output[g + 1];
        step[p]           = half_btf(c32, output[p], c32, output[p + 1], cos_bit);
        step[p + 1]       = half_btf(-c32, output[p + 1], c32, output[p], cos_bit);
    }

    // stage 11: final permutation, negating every odd output position
    for (int32_t i = 0; i < 32; ++i) {
        const int32_t src = step[last_stage_src[i]];
        output[i]         = (i & 1) ? -src : src;
    }
}
2897
2898
0
void svt_av1_fidentity4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
2899
0
    (void)stage_range;
2900
0
    (void)cos_bit;
2901
0
    for (int32_t i = 0; i < 4; ++i) {
2902
0
        output[i] = round_shift((int64_t)input[i] * new_sqrt2, new_sqrt2_bits);
2903
0
    }
2904
0
    assert(stage_range[0] + new_sqrt2_bits <= 32);
2905
0
}
2906
2907
0
// 8-sample forward identity transform: every sample is doubled.
// cos_bit and stage_range are accepted for signature compatibility and ignored.
void svt_av1_fidentity8_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)cos_bit;
    (void)stage_range;

    int32_t idx = 0;
    while (idx < 8) {
        output[idx] = input[idx] * 2;
        ++idx;
    }
}
2914
2915
0
void svt_av1_fidentity16_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
2916
0
    (void)stage_range;
2917
0
    (void)cos_bit;
2918
0
    for (int32_t i = 0; i < 16; ++i) {
2919
0
        output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits);
2920
0
    }
2921
0
    assert(stage_range[0] + new_sqrt2_bits <= 32);
2922
0
}
2923
2924
0
// 32-sample forward identity transform: every sample is multiplied by 4.
// cos_bit and stage_range are accepted for signature compatibility and ignored.
void svt_av1_fidentity32_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)cos_bit;
    (void)stage_range;

    int32_t idx = 0;
    while (idx < 32) {
        output[idx] = input[idx] * 4;
        ++idx;
    }
}
2931
2932
0
static void av1_fidentity64_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
2933
0
    (void)stage_range;
2934
0
    (void)cos_bit;
2935
0
    for (int32_t i = 0; i < 64; ++i) {
2936
0
        output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits);
2937
0
    }
2938
0
    assert(stage_range[0] + new_sqrt2_bits <= 32);
2939
0
}
2940
2941
60.3k
// Maps a 1-D transform type to the C routine that implements it.
// Unknown types trip assert(0) and, when asserts are compiled out, yield NULL.
TxfmFunc svt_aom_fwd_txfm_type_to_func(TxfmType txfmtype) {
    TxfmFunc selected = NULL;

    switch (txfmtype) {
    // DCT family
    case TXFM_TYPE_DCT4: selected = svt_av1_fdct4_new; break;
    case TXFM_TYPE_DCT8: selected = svt_av1_fdct8_new; break;
    case TXFM_TYPE_DCT16: selected = svt_av1_fdct16_new; break;
    case TXFM_TYPE_DCT32: selected = svt_av1_fdct32_new; break;
    case TXFM_TYPE_DCT64: selected = svt_av1_fdct64_new; break;
    // ADST family
    case TXFM_TYPE_ADST4: selected = svt_av1_fadst4_new; break;
    case TXFM_TYPE_ADST8: selected = svt_av1_fadst8_new; break;
    case TXFM_TYPE_ADST16: selected = svt_av1_fadst16_new; break;
    case TXFM_TYPE_ADST32: selected = av1_fadst32_new; break;
    // identity family
    case TXFM_TYPE_IDENTITY4: selected = svt_av1_fidentity4_c; break;
    case TXFM_TYPE_IDENTITY8: selected = svt_av1_fidentity8_c; break;
    case TXFM_TYPE_IDENTITY16: selected = svt_av1_fidentity16_c; break;
    case TXFM_TYPE_IDENTITY32: selected = svt_av1_fidentity32_c; break;
    case TXFM_TYPE_IDENTITY64: selected = av1_fidentity64_c; break;
    default: assert(0); break;
    }

    return selected;
}
2976
2977
//fwd_txfm2d_c
2978
static INLINE void av1_tranform_two_d_core_c(int16_t* input, uint32_t input_stride, int32_t* output,
2979
30.1k
                                             const Txfm2dFlipCfg* cfg, int32_t* buf, uint8_t bit_depth) {
2980
30.1k
    int32_t c, r;
2981
    // Note when assigning txfm_size_col, we use the txfm_size from the
2982
    // row configuration and vice versa. This is intentionally done to
2983
    // accurately perform rectangular transforms. When the transform is
2984
    // rectangular, the number of columns will be the same as the
2985
    // txfm_size stored in the row cfg struct. It will make no difference
2986
    // for square transforms.
2987
30.1k
    const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
2988
30.1k
    const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
2989
    // Take the shift from the larger dimension in the rectangular case.
2990
30.1k
    const int8_t* shift     = cfg->shift;
2991
30.1k
    const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2992
30.1k
    int8_t        stage_range_col[MAX_TXFM_STAGE_NUM];
2993
30.1k
    int8_t        stage_range_row[MAX_TXFM_STAGE_NUM];
2994
30.1k
    assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
2995
30.1k
    assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
2996
30.1k
    svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth);
2997
2998
30.1k
    const int8_t   cos_bit_col   = cfg->cos_bit_col;
2999
30.1k
    const int8_t   cos_bit_row   = cfg->cos_bit_row;
3000
30.1k
    const TxfmFunc txfm_func_col = svt_aom_fwd_txfm_type_to_func(cfg->txfm_type_col);
3001
30.1k
    const TxfmFunc txfm_func_row = svt_aom_fwd_txfm_type_to_func(cfg->txfm_type_row);
3002
30.1k
    ASSERT(txfm_func_col != NULL);
3003
30.1k
    ASSERT(txfm_func_row != NULL);
3004
    // use output buffer as temp buffer
3005
30.1k
    int32_t* temp_in  = output;
3006
30.1k
    int32_t* temp_out = output + txfm_size_row;
3007
3008
    // Columns
3009
749k
    for (c = 0; c < txfm_size_col; ++c) {
3010
719k
        if (cfg->ud_flip == 0) {
3011
27.0M
            for (r = 0; r < txfm_size_row; ++r) {
3012
26.2M
                temp_in[r] = input[r * input_stride + c];
3013
26.2M
            }
3014
18.4E
        } else {
3015
18.4E
            for (r = 0; r < txfm_size_row; ++r) {
3016
                // flip upside down
3017
0
                temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c];
3018
0
            }
3019
18.4E
        }
3020
719k
        svt_av1_round_shift_array_c(temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c
3021
719k
        txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
3022
719k
        svt_av1_round_shift_array_c(temp_out, txfm_size_row, -shift[1]); // NM svt_av1_round_shift_array_c
3023
719k
        if (cfg->lr_flip == 0) {
3024
27.0M
            for (r = 0; r < txfm_size_row; ++r) {
3025
26.3M
                buf[r * txfm_size_col + c] = temp_out[r];
3026
26.3M
            }
3027
18.4E
        } else {
3028
18.4E
            for (r = 0; r < txfm_size_row; ++r) {
3029
                // flip from left to right
3030
0
                buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
3031
0
            }
3032
18.4E
        }
3033
719k
    }
3034
3035
    // Rows
3036
674k
    for (r = 0; r < txfm_size_row; ++r) {
3037
644k
        txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row);
3038
644k
        svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col, -shift[2]);
3039
3040
644k
        if (abs(rect_type) == 1) {
3041
            // Multiply everything by Sqrt2 if the transform is rectangular and the
3042
            // size difference is a factor of 2.
3043
4.85M
            for (c = 0; c < txfm_size_col; ++c) {
3044
4.78M
                output[r * txfm_size_col + c] = round_shift((int64_t)output[r * txfm_size_col + c] * new_sqrt2,
3045
4.78M
                                                            new_sqrt2_bits);
3046
4.78M
            }
3047
74.7k
        }
3048
644k
    }
3049
30.1k
}
3050
3051
30.1k
// Fills cfg->stage_range_col / cfg->stage_range_row from fwd_txfm_range_mult2_list.
// Both arrays are zeroed first. Nothing more is written when the column type is
// TXFM_TYPE_INVALID, and the row ranges are skipped when the row type is invalid.
// Each entry is the looked-up "mult2" value rounded up and halved; the row entries
// additionally include the last column-stage value.
static INLINE void set_fwd_txfm_non_scale_range(Txfm2dFlipCfg* cfg) {
    av1_zero(cfg->stage_range_col);
    av1_zero(cfg->stage_range_row);

    if (cfg->txfm_type_col == TXFM_TYPE_INVALID) {
        return;
    }

    // Column stages (count clamped to the array capacity).
    const int8_t* col_mult2  = fwd_txfm_range_mult2_list[cfg->txfm_type_col];
    const int32_t col_stages = MIN(cfg->stage_num_col, MAX_TXFM_STAGE_NUM);
    int32_t       stage      = 0;
    while (stage < col_stages) {
        cfg->stage_range_col[stage] = (col_mult2[stage] + 1) >> 1;
        ++stage;
    }

    if (cfg->txfm_type_row == TXFM_TYPE_INVALID) {
        return;
    }

    // Row stages: accumulate on top of the final column stage.
    const int8_t* row_mult2  = fwd_txfm_range_mult2_list[cfg->txfm_type_row];
    const int32_t row_stages = MIN(cfg->stage_num_row, MAX_TXFM_STAGE_NUM);
    stage                    = 0;
    while (stage < row_stages) {
        cfg->stage_range_row[stage] = (col_mult2[cfg->stage_num_col - 1] + row_mult2[stage] + 1) >> 1;
        ++stage;
    }
}
3073
3074
30.1k
// Populates *cfg for a forward 2-D transform of the given type and size:
// flip flags, shift table, cosine bit depths, 1-D transform types, stage counts and
// (via set_fwd_txfm_non_scale_range) the per-stage ranges.
void svt_aom_transform_config(TxType tx_type, TxSize tx_size, Txfm2dFlipCfg* cfg) {
    assert(cfg != NULL);

    cfg->tx_size = tx_size;
    set_flip_cfg(tx_type, cfg);

    // Table indices: log2 of each dimension relative to the first table entry.
    const int32_t width_idx  = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
    const int32_t height_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0];

    // 1-D kernels: the vertical type drives the column pass, the horizontal type the row pass.
    const TxType1D col_type_1d = vtx_tab[tx_type];
    const TxType1D row_type_1d = htx_tab[tx_type];

    cfg->shift = fwd_txfm_shift_ls[tx_size];

    cfg->cos_bit_col = fwd_cos_bit_col[width_idx][height_idx];
    cfg->cos_bit_row = fwd_cos_bit_row[width_idx][height_idx];

    cfg->txfm_type_col = av1_txfm_type_ls[height_idx][col_type_1d];
    cfg->txfm_type_row = av1_txfm_type_ls[width_idx][row_type_1d];

    cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
    cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];

    set_fwd_txfm_non_scale_range(cfg);
}
3091
3092
8.75k
// Returns the sum of squared coefficients over an area_width x area_height window.
// `coeff` points at the window's first element; consecutive rows are coeff_stride apart.
static uint64_t energy_computation(int32_t* coeff, uint32_t coeff_stride, uint32_t area_width, uint32_t area_height) {
    uint64_t       energy = 0;
    const int32_t* row    = coeff;

    for (uint32_t y = 0; y < area_height; ++y, row += coeff_stride) {
        for (uint32_t x = 0; x < area_width; ++x) {
            // widen before squaring so the product is formed in 64 bits
            energy += (int64_t)SQR((int64_t)(row[x]));
        }
    }

    return energy;
}
3104
3105
3.21k
// Post-processes a 64x64 coefficient block stored with a 64-element row stride:
// measures the energy outside the top-left 32x32 quadrant, then packs that quadrant
// into the first 32*32 entries (row stride 32). Returns the measured energy.
uint64_t svt_handle_transform64x64_c(int32_t* output) {
    // Energy must be read before the repack overwrites part of the buffer.
    const uint64_t top_right_energy = energy_computation(output + 32, 64, 32, 32);
    const uint64_t bottom_energy    = energy_computation(output + 32 * 64, 64, 64, 32);

    // Row 0 is already in place; move rows 1..31 from stride 64 to stride 32.
    int32_t* dst = output + 32;
    int32_t* src = output + 64;
    for (int32_t row = 1; row < 32; ++row, dst += 32, src += 64) {
        svt_memcpy_c(dst, src, 32 * sizeof(*output));
    }

    return top_right_energy + bottom_energy;
}
3120
3121
// 64x64 forward 2-D transform: builds the TX_64X64 configuration for `transform_type`
// and runs the generic core with a stack-allocated 64*64 intermediate buffer.
void svt_av1_transform_two_d_64x64_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                     uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       column_pass_buf[64 * 64];

    svt_aom_transform_config(transform_type, TX_64X64, &flip_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &flip_cfg, column_pass_buf, bit_depth);
}
3130
3131
// 32x32 forward 2-D transform: builds the TX_32X32 configuration for `transform_type`
// and runs the generic core with a stack-allocated 32*32 intermediate buffer.
void svt_av1_transform_two_d_32x32_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                     uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       column_pass_buf[32 * 32];

    svt_aom_transform_config(transform_type, TX_32X32, &flip_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &flip_cfg, column_pass_buf, bit_depth);
}
3140
3141
// 16x16 forward 2-D transform: builds the TX_16X16 configuration for `transform_type`
// and runs the generic core with a stack-allocated 16*16 intermediate buffer.
void svt_av1_transform_two_d_16x16_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                     uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       column_pass_buf[16 * 16];

    svt_aom_transform_config(transform_type, TX_16X16, &flip_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &flip_cfg, column_pass_buf, bit_depth);
}
3150
3151
// 8x8 forward 2-D transform: builds the TX_8X8 configuration for `transform_type`
// and runs the generic core with a stack-allocated 8*8 intermediate buffer.
void svt_av1_transform_two_d_8x8_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       column_pass_buf[8 * 8];

    svt_aom_transform_config(transform_type, TX_8X8, &flip_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &flip_cfg, column_pass_buf, bit_depth);
}
3160
3161
// 4x4 forward 2-D transform: builds the TX_4X4 configuration for `transform_type`
// and runs the generic core with a stack-allocated 4*4 intermediate buffer.
void svt_av1_transform_two_d_4x4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       column_pass_buf[4 * 4];

    svt_aom_transform_config(transform_type, TX_4X4, &flip_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &flip_cfg, column_pass_buf, bit_depth);
}
3170
3171
/*********************************************************************
3172
* Rectangular forward 2-D transforms and their coefficient-repacking helpers
3173
*********************************************************************/
3174
/*
 * 64x32 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_64X32) and runs
 * av1_tranform_two_d_core_c() with a local 64 * 32 int32_t scratch buffer.
 * input, input_stride, output and bit_depth are forwarded unchanged.
 */
void svt_av1_fwd_txfm2d_64x32_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3175
2.33k
                                uint8_t bit_depth) {
3176
2.33k
    int32_t       intermediate_transform_buffer[64 * 32];
3177
2.33k
    Txfm2dFlipCfg cfg;
3178
    /*av1_get_fwd_txfm_cfg*/
3179
2.33k
    svt_aom_transform_config(transform_type, TX_64X32, &cfg);
3180
2.33k
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3181
2.33k
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3182
2.33k
}
3183
3184
2.33k
/*
 * Post-processing for a 64x32 coefficient block held in `output`.
 * Returns the value of energy_computation() over the region starting at
 * output + 32 (arguments presumably: start, stride 64, width 32, height 32 --
 * confirm against energy_computation), then compacts the left 32 coefficients
 * of every 64-wide row into consecutive 32-wide rows at the start of `output`.
 */
uint64_t svt_handle_transform64x32_c(int32_t* output) {
3185
    // top - right 32x32 area.
3186
2.33k
    const uint64_t three_quad_energy = energy_computation(output + 32, 64, 32, 32);
3187
3188
    // Re-pack non-zero coeffs in the first 32x32 indices.
    // Row 0 is already in place, so the copy starts at row 1; for row >= 1 the
    // destination [row*32, row*32+32) ends at or before the source start row*64.
3189
74.7k
    for (int32_t row = 1; row < 32; ++row) {
3190
72.3k
        svt_memcpy_c(output + row * 32, output + row * 64, 32 * sizeof(*output));
3191
72.3k
    }
3192
3193
2.33k
    return three_quad_energy;
3194
2.33k
}
3195
3196
/*
 * 32x64 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_32X64) and runs
 * av1_tranform_two_d_core_c() with a local 32 * 64 int32_t scratch buffer.
 * input, input_stride, output and bit_depth are forwarded unchanged.
 */
void svt_av1_fwd_txfm2d_32x64_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3197
0
                                uint8_t bit_depth) {
3198
0
    int32_t intermediate_transform_buffer[32 * 64];
3199
3200
0
    Txfm2dFlipCfg cfg;
3201
    /*av1_get_fwd_txfm_cfg*/
3202
0
    svt_aom_transform_config(transform_type, TX_32X64, &cfg);
3203
    /*fwd_txfm2d_c*/
3204
0
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3205
0
}
3206
3207
0
/*
 * Post-processing for a 32x64 coefficient block held in `output`.
 * Returns the value of energy_computation() over the region starting at
 * output + 32 * 32 (arguments presumably: start, stride 32, width 32,
 * height 32 -- confirm against energy_computation). `output` is not modified
 * by this function itself; no re-packing is performed.
 */
uint64_t svt_handle_transform32x64_c(int32_t* output) {
3208
    //bottom 32x32 area.
3209
0
    const uint64_t three_quad_energy = energy_computation(output + 32 * 32, 32, 32, 32);
3210
0
    return three_quad_energy;
3211
0
}
3212
3213
/*
 * 64x16 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_64X16) and runs
 * av1_tranform_two_d_core_c() with a local 64 * 16 int32_t scratch buffer.
 * input, input_stride, output and bit_depth are forwarded unchanged.
 */
void svt_av1_fwd_txfm2d_64x16_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3214
0
                                uint8_t bit_depth) {
3215
0
    int32_t       intermediate_transform_buffer[64 * 16];
3216
0
    Txfm2dFlipCfg cfg;
3217
    /*av1_get_fwd_txfm_cfg*/
3218
0
    svt_aom_transform_config(transform_type, TX_64X16, &cfg);
3219
0
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3220
0
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3221
0
}
3222
3223
0
/*
 * Post-processing for a 64x16 coefficient block held in `output`.
 * Returns the value of energy_computation() over the region starting at
 * output + 32 (arguments presumably: start, stride 64, width 32, height 16 --
 * confirm against energy_computation), then compacts the left 32 coefficients
 * of every 64-wide row into consecutive 32-wide rows at the start of `output`.
 */
uint64_t svt_handle_transform64x16_c(int32_t* output) {
3224
    // top - right 32x16 area.
3225
0
    const uint64_t three_quad_energy = energy_computation(output + 32, 64, 32, 16);
3226
3227
    // Re-pack non-zero coeffs in the first 32x16 indices.
    // Row 0 is already in place, so the copy starts at row 1.
3228
0
    for (int32_t row = 1; row < 16; ++row) {
3229
0
        svt_memcpy_c(output + row * 32, output + row * 64, 32 * sizeof(*output));
3230
0
    }
3231
3232
0
    return three_quad_energy;
3233
0
}
3234
3235
/*
 * 16x64 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_16X64) and runs
 * av1_tranform_two_d_core_c() with a local 16 * 64 int32_t scratch buffer.
 * input, input_stride, output and bit_depth are forwarded unchanged.
 */
void svt_av1_fwd_txfm2d_16x64_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3236
0
                                uint8_t bit_depth) {
3237
0
    int32_t intermediate_transform_buffer[16 * 64];
3238
3239
0
    Txfm2dFlipCfg cfg;
3240
    /*av1_get_fwd_txfm_cfg*/
3241
0
    svt_aom_transform_config(transform_type, TX_16X64, &cfg);
3242
    /*fwd_txfm2d_c*/
3243
0
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3244
0
}
3245
3246
0
/*
 * Post-processing for a 16x64 coefficient block held in `output`.
 * Returns the value of energy_computation() over the region starting at
 * output + 16 * 32 (arguments presumably: start, stride 16, width 16,
 * height 32 -- confirm against energy_computation). No re-packing is done.
 */
uint64_t svt_handle_transform16x64_c(int32_t* output) {
3247
    //bottom 16x32 area.
3248
0
    const uint64_t three_quad_energy = energy_computation(output + 16 * 32, 16, 16, 32);
3249
0
    return three_quad_energy;
3250
0
}
3251
3252
0
/*
 * N2/N4 counterpart of the 16x64 handler: ignores `output` and always
 * returns 0 (no energy is computed and nothing is re-packed).
 */
uint64_t svt_handle_transform16x64_N2_N4_c(int32_t* output) {
3253
0
    (void)output;
3254
0
    return 0;
3255
0
}
3256
3257
0
/*
 * N2/N4 counterpart of the 32x64 handler: ignores `output` and always
 * returns 0 (no energy is computed and nothing is re-packed).
 */
uint64_t svt_handle_transform32x64_N2_N4_c(int32_t* output) {
3258
0
    (void)output;
3259
0
    return 0;
3260
0
}
3261
3262
0
/*
 * N2/N4 counterpart of the 64x16 handler: compacts the left 32 coefficients
 * of each 64-wide row (rows 1..15) into 32-wide rows at the start of `output`
 * and always returns 0 (no energy is computed).
 */
uint64_t svt_handle_transform64x16_N2_N4_c(int32_t* output) {
3263
    // Re-pack non-zero coeffs in the first 32x16 indices.
3264
0
    for (int32_t row = 1; row < 16; ++row) {
3265
0
        svt_memcpy_c(output + row * 32, output + row * 64, 32 * sizeof(*output));
3266
0
    }
3267
3268
0
    return 0;
3269
0
}
3270
3271
0
/*
 * N2/N4 counterpart of the 64x32 handler: compacts the left 32 coefficients
 * of each 64-wide row (rows 1..31) into 32-wide rows at the start of `output`
 * and always returns 0 (no energy is computed).
 */
uint64_t svt_handle_transform64x32_N2_N4_c(int32_t* output) {
3272
    // Re-pack non-zero coeffs in the first 32x32 indices.
3273
0
    for (int32_t row = 1; row < 32; ++row) {
3274
0
        svt_memcpy_c(output + row * 32, output + row * 64, 32 * sizeof(*output));
3275
0
    }
3276
3277
0
    return 0;
3278
0
}
3279
3280
0
/*
 * N2/N4 handler for 64x64: compacts the left 32 coefficients of each 64-wide
 * row (rows 1..31 only) into 32-wide rows at the start of `output` and always
 * returns 0 (no energy is computed). Rows 32..63 are not touched here.
 */
uint64_t svt_handle_transform64x64_N2_N4_c(int32_t* output) {
3281
    // Re-pack non-zero coeffs in the first 32x32 indices.
3282
0
    for (int32_t row = 1; row < 32; ++row) {
3283
0
        svt_memcpy_c(output + row * 32, output + row * 64, 32 * sizeof(*output));
3284
0
    }
3285
3286
0
    return 0;
3287
0
}
3288
3289
/*
 * 32x16 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_32X16) and runs
 * av1_tranform_two_d_core_c() with a local 32 * 16 int32_t scratch buffer.
 */
void svt_av1_fwd_txfm2d_32x16_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3290
0
                                uint8_t bit_depth) {
3291
0
    int32_t       intermediate_transform_buffer[32 * 16];
3292
0
    Txfm2dFlipCfg cfg;
3293
0
    /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_32X16, &cfg);
3294
0
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3295
0
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3296
0
}
3297
3298
/*
 * 16x32 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_16X32) and runs
 * av1_tranform_two_d_core_c() with a local 16 * 32 int32_t scratch buffer.
 */
void svt_av1_fwd_txfm2d_16x32_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3299
0
                                uint8_t bit_depth) {
3300
0
    int32_t       intermediate_transform_buffer[16 * 32];
3301
0
    Txfm2dFlipCfg cfg;
3302
0
    /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_16X32, &cfg);
3303
0
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3304
0
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3305
0
}
3306
3307
/*
 * 16x8 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_16X8) and runs
 * av1_tranform_two_d_core_c() with a local 16 * 8 int32_t scratch buffer.
 */
void svt_av1_fwd_txfm2d_16x8_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3308
0
                               uint8_t bit_depth) {
3309
0
    int32_t       intermediate_transform_buffer[16 * 8];
3310
0
    Txfm2dFlipCfg cfg;
3311
0
    /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_16X8, &cfg);
3312
0
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3313
0
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3314
0
}
3315
3316
/*
 * 8x16 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_8X16) and runs
 * av1_tranform_two_d_core_c() with a local 8 * 16 int32_t scratch buffer.
 */
void svt_av1_fwd_txfm2d_8x16_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3317
0
                               uint8_t bit_depth) {
3318
0
    int32_t       intermediate_transform_buffer[8 * 16];
3319
0
    Txfm2dFlipCfg cfg;
3320
0
    /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_8X16, &cfg);
3321
0
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3322
0
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3323
0
}
3324
3325
/*
 * 32x8 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_32X8) and runs
 * av1_tranform_two_d_core_c() with a local 32 * 8 int32_t scratch buffer.
 */
void svt_av1_fwd_txfm2d_32x8_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3326
0
                               uint8_t bit_depth) {
3327
0
    int32_t       intermediate_transform_buffer[32 * 8];
3328
0
    Txfm2dFlipCfg cfg;
3329
0
    /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_32X8, &cfg);
3330
0
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3331
0
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3332
0
}
3333
3334
/*
 * 8x32 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_8X32) and runs
 * av1_tranform_two_d_core_c() with a local 8 * 32 int32_t scratch buffer.
 */
void svt_av1_fwd_txfm2d_8x32_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3335
0
                               uint8_t bit_depth) {
3336
0
    int32_t       intermediate_transform_buffer[8 * 32];
3337
0
    Txfm2dFlipCfg cfg;
3338
0
    /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_8X32, &cfg);
3339
0
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3340
0
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3341
0
}
3342
3343
/*
 * 16x4 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_16X4) and runs
 * av1_tranform_two_d_core_c() with a local 16 * 4 int32_t scratch buffer.
 */
void svt_av1_fwd_txfm2d_16x4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3344
0
                               uint8_t bit_depth) {
3345
0
    int32_t       intermediate_transform_buffer[16 * 4];
3346
0
    Txfm2dFlipCfg cfg;
3347
0
    /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_16X4, &cfg);
3348
0
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3349
0
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3350
0
}
3351
3352
/*
 * 4x16 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_4X16) and runs
 * av1_tranform_two_d_core_c() with a local 4 * 16 int32_t scratch buffer.
 */
void svt_av1_fwd_txfm2d_4x16_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3353
0
                               uint8_t bit_depth) {
3354
0
    int32_t       intermediate_transform_buffer[4 * 16];
3355
0
    Txfm2dFlipCfg cfg;
3356
0
    /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_4X16, &cfg);
3357
0
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3358
0
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3359
0
}
3360
3361
/*
 * 8x4 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_8X4) and runs
 * av1_tranform_two_d_core_c() with a local 8 * 4 int32_t scratch buffer.
 */
void svt_av1_fwd_txfm2d_8x4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3362
0
                              uint8_t bit_depth) {
3363
0
    int32_t       intermediate_transform_buffer[8 * 4];
3364
0
    Txfm2dFlipCfg cfg;
3365
0
    /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_8X4, &cfg);
3366
0
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3367
0
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3368
0
}
3369
3370
/*
 * 4x8 forward 2-D transform wrapper.
 * Fills a Txfm2dFlipCfg for (transform_type, TX_4X8) and runs
 * av1_tranform_two_d_core_c() with a local 4 * 8 int32_t scratch buffer.
 */
void svt_av1_fwd_txfm2d_4x8_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
3371
0
                              uint8_t bit_depth) {
3372
0
    int32_t       intermediate_transform_buffer[4 * 8];
3373
0
    Txfm2dFlipCfg cfg;
3374
0
    /*av1_get_fwd_txfm_cfg*/ svt_aom_transform_config(transform_type, TX_4X8, &cfg);
3375
0
    /*fwd_txfm2d_c*/ av1_tranform_two_d_core_c(
3376
0
        input, input_stride, output, &cfg, intermediate_transform_buffer, bit_depth);
3377
0
}
3378
3379
/*
 * Dispatches on transform_size to the matching "_N2" forward transform,
 * passing residual_buffer, coeff_buffer, residual_stride, transform_type and
 * bit_depth through unchanged.
 *
 *  - Sizes with a 64-point dimension (64x32, 32x64, 64x16, 16x64, 64x64) also
 *    store the return value of the matching svt_handle_transform*_N2_N4()
 *    call in *three_quad_energy; for every other size *three_quad_energy is
 *    left untouched.
 *  - For several sizes the "_c"-suffixed function is called directly unless
 *    transform_type is one of the listed types (DCT_DCT, or DCT_DCT/IDTX).
 *    NOTE(review): the unsuffixed names are presumably runtime-dispatched
 *    entry points -- confirm against aom_dsp_rtcd.h.
 *  - coeff_stride and component_type are unused.
 *
 * Always returns EB_ErrorNone; an unknown transform_size hits assert(0).
 */
static EbErrorType av1_estimate_transform_N2(int16_t* residual_buffer, uint32_t residual_stride, int32_t* coeff_buffer,
3380
                                             uint32_t coeff_stride, TxSize transform_size, uint64_t* three_quad_energy,
3381
                                             uint32_t bit_depth, TxType transform_type, PlaneType component_type)
3382
3383
0
{
3384
0
    EbErrorType return_error = EB_ErrorNone;
3385
3386
0
    (void)coeff_stride;
3387
0
    (void)component_type;
3388
3389
0
    switch (transform_size) {
3390
0
    case TX_64X32:
3391
0
        if (transform_type == DCT_DCT) {
3392
0
            svt_av1_fwd_txfm2d_64x32_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3393
0
        } else {
3394
0
            svt_av1_fwd_txfm2d_64x32_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3395
0
        }
3396
3397
0
        *three_quad_energy = svt_handle_transform64x32_N2_N4(coeff_buffer);
3398
3399
0
        break;
3400
3401
0
    case TX_32X64:
3402
0
        if (transform_type == DCT_DCT) {
3403
0
            svt_av1_fwd_txfm2d_32x64_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3404
0
        } else {
3405
0
            svt_av1_fwd_txfm2d_32x64_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3406
0
        }
3407
3408
0
        *three_quad_energy = svt_handle_transform32x64_N2_N4(coeff_buffer);
3409
3410
0
        break;
3411
3412
0
    case TX_64X16:
3413
0
        if (transform_type == DCT_DCT) {
3414
0
            svt_av1_fwd_txfm2d_64x16_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3415
0
        } else {
3416
0
            svt_av1_fwd_txfm2d_64x16_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3417
0
        }
3418
3419
0
        *three_quad_energy = svt_handle_transform64x16_N2_N4(coeff_buffer);
3420
3421
0
        break;
3422
3423
0
    case TX_16X64:
3424
0
        if (transform_type == DCT_DCT) {
3425
0
            svt_av1_fwd_txfm2d_16x64_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3426
0
        } else {
3427
0
            svt_av1_fwd_txfm2d_16x64_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3428
0
        }
3429
3430
0
        *three_quad_energy = svt_handle_transform16x64_N2_N4(coeff_buffer);
3431
3432
0
        break;
3433
3434
0
    case TX_32X16:
3435
        // TTK
3436
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3437
0
            svt_av1_fwd_txfm2d_32x16_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3438
0
        } else {
3439
0
            svt_av1_fwd_txfm2d_32x16_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3440
0
        }
3441
0
        break;
3442
3443
0
    case TX_16X32:
3444
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3445
0
            svt_av1_fwd_txfm2d_16x32_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3446
0
        } else {
3447
0
            svt_av1_fwd_txfm2d_16x32_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3448
0
        }
3449
0
        break;
3450
3451
0
    case TX_16X8:
3452
0
        svt_av1_fwd_txfm2d_16x8_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3453
0
        break;
3454
3455
0
    case TX_8X16:
3456
0
        svt_av1_fwd_txfm2d_8x16_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3457
0
        break;
3458
3459
0
    case TX_32X8:
3460
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3461
0
            svt_av1_fwd_txfm2d_32x8_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3462
0
        } else {
3463
0
            svt_av1_fwd_txfm2d_32x8_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3464
0
        }
3465
0
        break;
3466
3467
0
    case TX_8X32:
3468
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3469
0
            svt_av1_fwd_txfm2d_8x32_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3470
0
        } else {
3471
0
            svt_av1_fwd_txfm2d_8x32_N2_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3472
0
        }
3473
0
        break;
3474
0
    case TX_16X4:
3475
0
        svt_av1_fwd_txfm2d_16x4_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3476
0
        break;
3477
0
    case TX_4X16:
3478
0
        svt_av1_fwd_txfm2d_4x16_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3479
0
        break;
3480
0
    case TX_8X4:
3481
3482
0
        svt_av1_fwd_txfm2d_8x4_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3483
3484
0
        break;
3485
0
    case TX_4X8:
3486
3487
0
        svt_av1_fwd_txfm2d_4x8_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3488
3489
0
        break;
3490
3491
0
    case TX_64X64:
3492
3493
0
        svt_av1_fwd_txfm2d_64x64_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3494
3495
0
        *three_quad_energy = svt_handle_transform64x64_N2_N4(coeff_buffer);
3496
3497
0
        break;
3498
3499
0
    case TX_32X32:
3500
0
        if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST ||
3501
0
            transform_type == H_ADST || transform_type == V_FLIPADST || transform_type == H_FLIPADST) {
3502
            // Tahani: I believe those cases are never hit
3503
0
            svt_aom_transform_two_d_32x32_N2_c(
3504
0
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3505
0
        }
3506
3507
0
        else {
3508
0
            svt_av1_fwd_txfm2d_32x32_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3509
0
        }
3510
3511
0
        break;
3512
3513
0
    case TX_16X16:
3514
3515
0
        svt_av1_fwd_txfm2d_16x16_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3516
3517
0
        break;
3518
0
    case TX_8X8:
3519
3520
0
        svt_av1_fwd_txfm2d_8x8_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3521
3522
0
        break;
3523
0
    case TX_4X4:
3524
3525
0
        svt_av1_fwd_txfm2d_4x4_N2(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3526
3527
0
        break;
3528
0
    default:
3529
0
        assert(0);
3530
0
        break;
3531
0
    }
3532
3533
0
    return return_error;
3534
0
}
3535
3536
/*
 * Dispatches on transform_size to the matching "_N4" forward transform,
 * passing residual_buffer, coeff_buffer, residual_stride, transform_type and
 * bit_depth through unchanged.
 *
 *  - Sizes with a 64-point dimension (64x32, 32x64, 64x16, 16x64, 64x64) also
 *    store the return value of the matching svt_handle_transform*_N2_N4()
 *    call in *three_quad_energy; for every other size *three_quad_energy is
 *    left untouched.
 *  - For several sizes the "_c"-suffixed function is called directly unless
 *    transform_type is one of the listed types (DCT_DCT, or DCT_DCT/IDTX).
 *    NOTE(review): the unsuffixed names are presumably runtime-dispatched
 *    entry points -- confirm against aom_dsp_rtcd.h.
 *  - coeff_stride and component_type are unused.
 *
 * Always returns EB_ErrorNone; an unknown transform_size hits assert(0).
 */
static EbErrorType av1_estimate_transform_N4(int16_t* residual_buffer, uint32_t residual_stride, int32_t* coeff_buffer,
3537
                                             uint32_t coeff_stride, TxSize transform_size, uint64_t* three_quad_energy,
3538
                                             uint32_t bit_depth, TxType transform_type, PlaneType component_type)
3539
3540
0
{
3541
0
    EbErrorType return_error = EB_ErrorNone;
3542
3543
0
    (void)coeff_stride;
3544
0
    (void)component_type;
3545
3546
0
    switch (transform_size) {
3547
0
    case TX_64X32:
3548
0
        if (transform_type == DCT_DCT) {
3549
0
            svt_av1_fwd_txfm2d_64x32_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3550
0
        } else {
3551
0
            svt_av1_fwd_txfm2d_64x32_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3552
0
        }
3553
3554
0
        *three_quad_energy = svt_handle_transform64x32_N2_N4(coeff_buffer);
3555
3556
0
        break;
3557
3558
0
    case TX_32X64:
3559
0
        if (transform_type == DCT_DCT) {
3560
0
            svt_av1_fwd_txfm2d_32x64_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3561
0
        } else {
3562
0
            svt_av1_fwd_txfm2d_32x64_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3563
0
        }
3564
3565
0
        *three_quad_energy = svt_handle_transform32x64_N2_N4(coeff_buffer);
3566
3567
0
        break;
3568
3569
0
    case TX_64X16:
3570
0
        if (transform_type == DCT_DCT) {
3571
0
            svt_av1_fwd_txfm2d_64x16_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3572
0
        } else {
3573
0
            svt_av1_fwd_txfm2d_64x16_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3574
0
        }
3575
3576
0
        *three_quad_energy = svt_handle_transform64x16_N2_N4(coeff_buffer);
3577
3578
0
        break;
3579
3580
0
    case TX_16X64:
3581
0
        if (transform_type == DCT_DCT) {
3582
0
            svt_av1_fwd_txfm2d_16x64_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3583
0
        } else {
3584
0
            svt_av1_fwd_txfm2d_16x64_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3585
0
        }
3586
3587
0
        *three_quad_energy = svt_handle_transform16x64_N2_N4(coeff_buffer);
3588
3589
0
        break;
3590
3591
0
    case TX_32X16:
3592
        // TTK
3593
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3594
0
            svt_av1_fwd_txfm2d_32x16_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3595
0
        } else {
3596
0
            svt_av1_fwd_txfm2d_32x16_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3597
0
        }
3598
0
        break;
3599
3600
0
    case TX_16X32:
3601
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3602
0
            svt_av1_fwd_txfm2d_16x32_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3603
0
        } else {
3604
0
            svt_av1_fwd_txfm2d_16x32_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3605
0
        }
3606
0
        break;
3607
3608
0
    case TX_16X8:
3609
0
        svt_av1_fwd_txfm2d_16x8_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3610
0
        break;
3611
3612
0
    case TX_8X16:
3613
0
        svt_av1_fwd_txfm2d_8x16_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3614
0
        break;
3615
3616
0
    case TX_32X8:
3617
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3618
0
            svt_av1_fwd_txfm2d_32x8_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3619
0
        } else {
3620
0
            svt_av1_fwd_txfm2d_32x8_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3621
0
        }
3622
0
        break;
3623
3624
0
    case TX_8X32:
3625
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3626
0
            svt_av1_fwd_txfm2d_8x32_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3627
0
        } else {
3628
0
            svt_av1_fwd_txfm2d_8x32_N4_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3629
0
        }
3630
0
        break;
3631
0
    case TX_16X4:
3632
0
        svt_av1_fwd_txfm2d_16x4_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3633
0
        break;
3634
0
    case TX_4X16:
3635
0
        svt_av1_fwd_txfm2d_4x16_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3636
0
        break;
3637
0
    case TX_8X4:
3638
3639
0
        svt_av1_fwd_txfm2d_8x4_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3640
3641
0
        break;
3642
0
    case TX_4X8:
3643
3644
0
        svt_av1_fwd_txfm2d_4x8_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3645
3646
0
        break;
3647
3648
0
    case TX_64X64:
3649
3650
0
        svt_av1_fwd_txfm2d_64x64_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3651
3652
0
        *three_quad_energy = svt_handle_transform64x64_N2_N4(coeff_buffer);
3653
3654
0
        break;
3655
3656
0
    case TX_32X32:
3657
0
        if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST ||
3658
0
            transform_type == H_ADST || transform_type == V_FLIPADST || transform_type == H_FLIPADST) {
3659
            // Tahani: I believe those cases are never hit
3660
0
            svt_aom_transform_two_d_32x32_N4_c(
3661
0
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3662
0
        }
3663
3664
0
        else {
3665
0
            svt_av1_fwd_txfm2d_32x32_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3666
0
        }
3667
3668
0
        break;
3669
3670
0
    case TX_16X16:
3671
3672
0
        svt_av1_fwd_txfm2d_16x16_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3673
3674
0
        break;
3675
0
    case TX_8X8:
3676
3677
0
        svt_av1_fwd_txfm2d_8x8_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3678
3679
0
        break;
3680
0
    case TX_4X4:
3681
3682
0
        svt_av1_fwd_txfm2d_4x4_N4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3683
3684
0
        break;
3685
0
    default:
3686
0
        assert(0);
3687
0
        break;
3688
0
    }
3689
3690
0
    return return_error;
3691
0
}
3692
3693
/*
 * Runs av1_estimate_transform_N4() with the same arguments, then clears
 * coefficients in coeff_buffer so that index 0 is never cleared.
 *
 * The clearing loop treats coeff_buffer as rows of tx_size_wide[transform_size]
 * entries and zeroes index i (1 .. width*height - 1) when its column is below
 * width/4 OR its row is below height/4. Entries whose column and row are both
 * outside those bounds are left exactly as the N4 call produced them.
 * NOTE(review): presumably the N4 transform leaves that remaining area zero,
 * making the result DC-only -- confirm against the _N4 kernels.
 *
 * Returns whatever av1_estimate_transform_N4() returned.
 */
static EbErrorType av1_estimate_transform_ONLY_DC(int16_t* residual_buffer, uint32_t residual_stride,
3694
                                                  int32_t* coeff_buffer, uint32_t coeff_stride, TxSize transform_size,
3695
                                                  uint64_t* three_quad_energy, uint32_t bit_depth,
3696
                                                  TxType transform_type, PlaneType component_type)
3697
3698
0
{
3699
0
    EbErrorType return_error = av1_estimate_transform_N4(residual_buffer,
3700
0
                                                         residual_stride,
3701
0
                                                         coeff_buffer,
3702
0
                                                         coeff_stride,
3703
0
                                                         transform_size,
3704
0
                                                         three_quad_energy,
3705
0
                                                         bit_depth,
3706
0
                                                         transform_type,
3707
0
                                                         component_type);
3708
3709
    // Start at 1 so the first coefficient (index 0) is preserved.
0
    for (int i = 1; i < (tx_size_wide[transform_size] * tx_size_high[transform_size]); i++) {
3710
        // i % width is the column, i / width is the row.
0
        if (i % tx_size_wide[transform_size] < (tx_size_wide[transform_size] >> 2) ||
3711
0
            i / tx_size_wide[transform_size] < (tx_size_high[transform_size] >> 2)) {
3712
0
            coeff_buffer[i] = 0;
3713
0
        }
3714
0
    }
3715
0
    return return_error;
3716
0
}
3717
3718
/*
 * Dispatches on transform_size to the matching full-size forward transform,
 * passing residual_buffer, coeff_buffer, residual_stride, transform_type and
 * bit_depth through unchanged.
 *
 *  - Sizes with a 64-point dimension (64x32, 32x64, 64x16, 16x64, 64x64) also
 *    store the return value of the matching svt_handle_transform*() call in
 *    *three_quad_energy; for every other size *three_quad_energy is left
 *    untouched.
 *  - For several sizes the "_c"-suffixed function is called directly unless
 *    transform_type is one of the listed types (DCT_DCT, or DCT_DCT/IDTX).
 *    NOTE(review): the unsuffixed names are presumably runtime-dispatched
 *    entry points -- confirm against aom_dsp_rtcd.h.
 *  - coeff_stride and component_type are unused.
 *
 * Always returns EB_ErrorNone; an unknown transform_size hits assert(0).
 */
static EbErrorType av1_estimate_transform_default(int16_t* residual_buffer, uint32_t residual_stride,
3719
                                                  int32_t* coeff_buffer, uint32_t coeff_stride, TxSize transform_size,
3720
                                                  uint64_t* three_quad_energy, uint32_t bit_depth,
3721
                                                  TxType transform_type, PlaneType component_type)
3722
3723
30.1k
{
3724
30.1k
    EbErrorType return_error = EB_ErrorNone;
3725
3726
30.1k
    (void)coeff_stride;
3727
30.1k
    (void)component_type;
3728
3729
30.1k
    switch (transform_size) {
3730
2.33k
    case TX_64X32:
3731
2.33k
        if (transform_type == DCT_DCT) {
3732
2.33k
            svt_av1_fwd_txfm2d_64x32(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3733
2.33k
        } else {
3734
0
            svt_av1_fwd_txfm2d_64x32_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3735
0
        }
3736
3737
2.33k
        *three_quad_energy = svt_handle_transform64x32(coeff_buffer);
3738
3739
2.33k
        break;
3740
3741
0
    case TX_32X64:
3742
0
        if (transform_type == DCT_DCT) {
3743
0
            svt_av1_fwd_txfm2d_32x64(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3744
0
        } else {
3745
0
            svt_av1_fwd_txfm2d_32x64_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3746
0
        }
3747
3748
0
        *three_quad_energy = svt_handle_transform32x64(coeff_buffer);
3749
3750
0
        break;
3751
3752
0
    case TX_64X16:
3753
0
        if (transform_type == DCT_DCT) {
3754
0
            svt_av1_fwd_txfm2d_64x16(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3755
0
        } else {
3756
0
            svt_av1_fwd_txfm2d_64x16_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3757
0
        }
3758
3759
0
        *three_quad_energy = svt_handle_transform64x16(coeff_buffer);
3760
3761
0
        break;
3762
3763
0
    case TX_16X64:
3764
0
        if (transform_type == DCT_DCT) {
3765
0
            svt_av1_fwd_txfm2d_16x64(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3766
0
        } else {
3767
0
            svt_av1_fwd_txfm2d_16x64_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3768
0
        }
3769
3770
0
        *three_quad_energy = svt_handle_transform16x64(coeff_buffer);
3771
3772
0
        break;
3773
3774
0
    case TX_32X16:
3775
        // TTK
3776
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3777
0
            svt_av1_fwd_txfm2d_32x16(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3778
0
        } else {
3779
0
            svt_av1_fwd_txfm2d_32x16_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3780
0
        }
3781
0
        break;
3782
3783
0
    case TX_16X32:
3784
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3785
0
            svt_av1_fwd_txfm2d_16x32(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3786
0
        } else {
3787
0
            svt_av1_fwd_txfm2d_16x32_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3788
0
        }
3789
0
        break;
3790
3791
0
    case TX_16X8:
3792
0
        svt_av1_fwd_txfm2d_16x8(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3793
0
        break;
3794
3795
0
    case TX_8X16:
3796
0
        svt_av1_fwd_txfm2d_8x16(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3797
0
        break;
3798
3799
0
    case TX_32X8:
3800
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3801
0
            svt_av1_fwd_txfm2d_32x8(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3802
0
        } else {
3803
0
            svt_av1_fwd_txfm2d_32x8_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3804
0
        }
3805
0
        break;
3806
3807
0
    case TX_8X32:
3808
0
        if ((transform_type == DCT_DCT) || (transform_type == IDTX)) {
3809
0
            svt_av1_fwd_txfm2d_8x32(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3810
0
        } else {
3811
0
            svt_av1_fwd_txfm2d_8x32_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3812
0
        }
3813
0
        break;
3814
0
    case TX_16X4:
3815
0
        svt_av1_fwd_txfm2d_16x4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3816
0
        break;
3817
0
    case TX_4X16:
3818
0
        svt_av1_fwd_txfm2d_4x16(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3819
0
        break;
3820
0
    case TX_8X4:
3821
3822
0
        svt_av1_fwd_txfm2d_8x4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3823
3824
0
        break;
3825
0
    case TX_4X8:
3826
3827
0
        svt_av1_fwd_txfm2d_4x8(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3828
3829
0
        break;
3830
3831
3.21k
    case TX_64X64:
3832
3833
3.21k
        svt_av1_fwd_txfm2d_64x64(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3834
3835
3.21k
        *three_quad_energy = svt_handle_transform64x64(coeff_buffer);
3836
3837
3.21k
        break;
3838
3839
6.63k
    case TX_32X32:
3840
6.63k
        if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST ||
3841
6.63k
            transform_type == H_ADST || transform_type == V_FLIPADST || transform_type == H_FLIPADST) {
3842
            // Tahani: I believe those cases are never hit
3843
0
            svt_av1_transform_two_d_32x32_c(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3844
0
        }
3845
3846
6.63k
        else {
3847
6.63k
            svt_av1_fwd_txfm2d_32x32(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3848
6.63k
        }
3849
3850
6.63k
        break;
3851
3852
3.91k
    case TX_16X16:
3853
3854
3.91k
        svt_av1_fwd_txfm2d_16x16(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3855
3856
3.91k
        break;
3857
8.44k
    case TX_8X8:
3858
3859
8.44k
        svt_av1_fwd_txfm2d_8x8(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3860
3861
8.44k
        break;
3862
5.62k
    case TX_4X4:
3863
3864
5.62k
        svt_av1_fwd_txfm2d_4x4(residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3865
3866
5.62k
        break;
3867
0
    default:
3868
0
        assert(0);
3869
0
        break;
3870
30.1k
    }
3871
3872
30.1k
    return return_error;
3873
30.1k
}
3874
3875
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
3876
   pixel.
3877
   Shared for both high and low bit depth.
3878
 */
3879
1.20M
/*
 * Two-pass 4x4 transform from `input` (int16_t, rows `stride` elements apart)
 * into `output` (16 contiguous int32_t values).
 *  - Pass 1: iteration i reads the four samples input[i + k * stride],
 *    k = 0..3, and writes the four results to output[4 * i .. 4 * i + 3].
 *  - Pass 2: works in place on `output`; iteration i reads output[i + 4 * k]
 *    and writes the results back to the same positions, each multiplied by
 *    UNIT_QUANT_FACTOR.
 * Intermediate arithmetic is done in int64_t and narrowed to int32_t on store.
 */
void svt_av1_fwht4x4_c(int16_t* input, int32_t* output, uint32_t stride) {
3880
1.20M
    int            i;
3881
1.20M
    int64_t        a1, b1, c1, d1, e1;
3882
1.20M
    const int16_t* ip_pass0 = input;
3883
1.20M
    const int32_t* ip       = NULL;
3884
1.20M
    int32_t*       op       = output;
3885
3886
    // Pass 1: one input column per iteration, four contiguous outputs.
5.99M
    for (i = 0; i < 4; i++) {
3887
4.79M
        a1 = ip_pass0[0 * stride];
3888
4.79M
        b1 = ip_pass0[1 * stride];
3889
4.79M
        c1 = ip_pass0[2 * stride];
3890
4.79M
        d1 = ip_pass0[3 * stride];
3891
3892
4.79M
        a1 += b1;
3893
4.79M
        d1 = d1 - c1;
3894
4.79M
        e1 = (a1 - d1) >> 1;
3895
4.79M
        b1 = e1 - b1;
3896
4.79M
        c1 = e1 - c1;
3897
4.79M
        a1 -= c1;
3898
4.79M
        d1 += b1;
3899
        // Results are stored in the order a1, c1, d1, b1.
4.79M
        op[0] = (int32_t)a1;
3900
4.79M
        op[1] = (int32_t)c1;
3901
4.79M
        op[2] = (int32_t)d1;
3902
4.79M
        op[3] = (int32_t)b1;
3903
3904
4.79M
        ip_pass0++;
3905
4.79M
        op += 4;
3906
4.79M
    }
3907
    // Pass 2 reads and writes the same buffer.
1.20M
    ip = output;
3908
1.20M
    op = output;
3909
3910
5.97M
    for (i = 0; i < 4; i++) {
3911
4.77M
        a1 = ip[4 * 0];
3912
4.77M
        b1 = ip[4 * 1];
3913
4.77M
        c1 = ip[4 * 2];
3914
4.77M
        d1 = ip[4 * 3];
3915
3916
4.77M
        a1 += b1;
3917
4.77M
        d1 -= c1;
3918
4.77M
        e1 = (a1 - d1) >> 1;
3919
4.77M
        b1 = e1 - b1;
3920
4.77M
        c1 = e1 - c1;
3921
4.77M
        a1 -= c1;
3922
4.77M
        d1 += b1;
3923
4.77M
        op[4 * 0] = (int32_t)(a1 * UNIT_QUANT_FACTOR);
3924
4.77M
        op[4 * 1] = (int32_t)(c1 * UNIT_QUANT_FACTOR);
3925
4.77M
        op[4 * 2] = (int32_t)(d1 * UNIT_QUANT_FACTOR);
3926
4.77M
        op[4 * 3] = (int32_t)(b1 * UNIT_QUANT_FACTOR);
3927
3928
4.77M
        ip++;
3929
4.77M
        op++;
3930
4.77M
    }
3931
1.20M
}
3932
3933
/*********************************************************************
3934
* Transform
3935
*   Note there is an implicit assumption that TU Size <= PU Size,
3936
*   which is different than the HEVC requirements.
3937
*********************************************************************/
3938
EbErrorType svt_aom_estimate_transform(PictureControlSet* pcs, ModeDecisionContext* ctx, int16_t* residual_buffer,
3939
                                       uint32_t residual_stride, int32_t* coeff_buffer, uint32_t coeff_stride,
3940
                                       TxSize transform_size, uint64_t* three_quad_energy, uint32_t bit_depth,
3941
                                       TxType transform_type, PlaneType component_type, TxCoeffShape trans_coeff_shape)
3942
3943
1.22M
{
3944
1.22M
    (void)trans_coeff_shape;
3945
1.22M
    (void)coeff_stride;
3946
1.22M
    (void)component_type;
3947
3948
1.22M
    if (svt_av1_is_lossless_segment(pcs, ctx->blk_ptr->segment_id)) {
3949
1.20M
        assert(transform_type == DCT_DCT);
3950
1.20M
        int32_t dst[16];
3951
3952
1.20M
        svt_av1_fwht4x4(residual_buffer, dst, residual_stride);
3953
5.82M
        for (int i = 0; i < 4; i++) {
3954
22.8M
            for (int j = 0; j < 4; j++) {
3955
18.1M
                coeff_buffer[(j << 2) + i] = dst[(i << 2) + j];
3956
18.1M
            }
3957
4.62M
        }
3958
1.20M
        return EB_ErrorNone;
3959
1.20M
    }
3960
3961
29.1k
    switch (trans_coeff_shape) {
3962
30.1k
    case DEFAULT_SHAPE:
3963
30.1k
        return av1_estimate_transform_default(residual_buffer,
3964
30.1k
                                              residual_stride,
3965
30.1k
                                              coeff_buffer,
3966
30.1k
                                              coeff_stride,
3967
30.1k
                                              transform_size,
3968
30.1k
                                              three_quad_energy,
3969
30.1k
                                              bit_depth,
3970
30.1k
                                              transform_type,
3971
30.1k
                                              component_type);
3972
0
    case N2_SHAPE:
3973
0
        return av1_estimate_transform_N2(residual_buffer,
3974
0
                                         residual_stride,
3975
0
                                         coeff_buffer,
3976
0
                                         coeff_stride,
3977
0
                                         transform_size,
3978
0
                                         three_quad_energy,
3979
0
                                         bit_depth,
3980
0
                                         transform_type,
3981
0
                                         component_type);
3982
0
    case N4_SHAPE:
3983
0
        return av1_estimate_transform_N4(residual_buffer,
3984
0
                                         residual_stride,
3985
0
                                         coeff_buffer,
3986
0
                                         coeff_stride,
3987
0
                                         transform_size,
3988
0
                                         three_quad_energy,
3989
0
                                         bit_depth,
3990
0
                                         transform_type,
3991
0
                                         component_type);
3992
0
    case ONLY_DC_SHAPE:
3993
0
        return av1_estimate_transform_ONLY_DC(residual_buffer,
3994
0
                                              residual_stride,
3995
0
                                              coeff_buffer,
3996
0
                                              coeff_stride,
3997
0
                                              transform_size,
3998
0
                                              three_quad_energy,
3999
0
                                              bit_depth,
4000
0
                                              transform_type,
4001
0
                                              component_type);
4002
29.1k
    }
4003
4004
29.1k
    assert(0);
4005
0
    return EB_ErrorBadParameter;
4006
29.1k
}
4007
4008
// PF_N4
4009
0
// 64x64 forward transform via svt_av1_fwd_txfm2d_64x64_N4. The requested type must be
// DCT_DCT (asserted); DCT_DCT is what is handed to the kernel.
static void highbd_fwd_txfm_64x64_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x64_N4(src_diff, (int32_t*)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
4015
4016
0
// 32x64 forward transform via svt_av1_fwd_txfm2d_32x64_N4. DCT_DCT is asserted; the tx_type
// stored in txfm_param is the value forwarded to the kernel.
static void highbd_fwd_txfm_32x64_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_32x64_N4(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4022
4023
0
// 64x32 forward transform via svt_av1_fwd_txfm2d_64x32_N4. DCT_DCT is asserted; the tx_type
// stored in txfm_param is the value forwarded to the kernel.
static void highbd_fwd_txfm_64x32_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x32_N4(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4029
4030
0
// 16x64 forward transform via svt_av1_fwd_txfm2d_16x64_N4. The requested type must be
// DCT_DCT (asserted); DCT_DCT is what is handed to the kernel.
static void highbd_fwd_txfm_16x64_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_16x64_N4(src_diff, (int32_t*)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
4036
4037
0
// 64x16 forward transform via svt_av1_fwd_txfm2d_64x16_N4. The requested type must be
// DCT_DCT (asserted); DCT_DCT is what is handed to the kernel.
static void highbd_fwd_txfm_64x16_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x16_N4(src_diff, (int32_t*)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
4043
4044
0
// 32x32 forward transform via svt_av1_fwd_txfm2d_32x32_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_32x32_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_32x32_N4(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4050
4051
0
// 16x16 forward transform via svt_av1_fwd_txfm2d_16x16_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x16_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_16x16_N4(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4057
4058
0
// 8x8 forward transform via svt_av1_fwd_txfm2d_8x8_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x8_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_8x8_N4(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4064
4065
0
// 4x8 forward transform via svt_av1_fwd_txfm2d_4x8_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_4x8_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x8_N4(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4069
4070
0
// 8x4 forward transform via svt_av1_fwd_txfm2d_8x4_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x4_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x4_N4(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4074
4075
0
// 8x16 forward transform via svt_av1_fwd_txfm2d_8x16_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x16_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_8x16_N4(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4081
4082
0
// 16x8 forward transform via svt_av1_fwd_txfm2d_16x8_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x8_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_16x8_N4(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4088
4089
0
// 16x32 forward transform via svt_av1_fwd_txfm2d_16x32_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x32_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x32_N4(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4093
4094
0
// 32x16 forward transform via svt_av1_fwd_txfm2d_32x16_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_32x16_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x16_N4(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4098
4099
0
// 4x16 forward transform, using the tx_type and bd from txfm_param.
// NOTE(review): unlike the other *_n4 helpers, this calls svt_av1_fwd_txfm2d_4x16 rather than
// a kernel with an _N4 suffix - confirm this is intentional.
static void highbd_fwd_txfm_4x16_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x16(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4103
4104
0
// 16x4 forward transform via svt_av1_fwd_txfm2d_16x4_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x4_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x4_N4(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4108
4109
0
// 8x32 forward transform via svt_av1_fwd_txfm2d_8x32_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x32_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x32_N4(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4113
4114
0
// 32x8 forward transform via svt_av1_fwd_txfm2d_32x8_N4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_32x8_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x8_N4(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4118
4119
//PF_N2
4120
0
// 64x64 forward transform via svt_av1_fwd_txfm2d_64x64_N2. The requested type must be
// DCT_DCT (asserted); DCT_DCT is what is handed to the kernel.
static void highbd_fwd_txfm_64x64_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x64_N2(src_diff, (int32_t*)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
4126
4127
0
// 32x64 forward transform via svt_av1_fwd_txfm2d_32x64_N2. DCT_DCT is asserted; the tx_type
// stored in txfm_param is the value forwarded to the kernel.
static void highbd_fwd_txfm_32x64_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_32x64_N2(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4133
4134
0
// 64x32 forward transform via svt_av1_fwd_txfm2d_64x32_N2. DCT_DCT is asserted; the tx_type
// stored in txfm_param is the value forwarded to the kernel.
static void highbd_fwd_txfm_64x32_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x32_N2(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4140
4141
0
// 16x64 forward transform via svt_av1_fwd_txfm2d_16x64_N2. The requested type must be
// DCT_DCT (asserted); DCT_DCT is what is handed to the kernel.
static void highbd_fwd_txfm_16x64_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_16x64_N2(src_diff, (int32_t*)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
4147
4148
0
// 64x16 forward transform via svt_av1_fwd_txfm2d_64x16_N2. The requested type must be
// DCT_DCT (asserted); DCT_DCT is what is handed to the kernel.
static void highbd_fwd_txfm_64x16_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x16_N2(src_diff, (int32_t*)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
4154
4155
0
// 32x32 forward transform via svt_av1_fwd_txfm2d_32x32_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_32x32_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_32x32_N2(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4161
4162
0
// 16x16 forward transform via svt_av1_fwd_txfm2d_16x16_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x16_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_16x16_N2(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4168
4169
0
// 8x8 forward transform via svt_av1_fwd_txfm2d_8x8_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x8_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_8x8_N2(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4175
4176
0
// 4x8 forward transform via svt_av1_fwd_txfm2d_4x8_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_4x8_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x8_N2(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4180
4181
0
// 8x4 forward transform via svt_av1_fwd_txfm2d_8x4_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x4_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x4_N2(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4185
4186
0
// 8x16 forward transform via svt_av1_fwd_txfm2d_8x16_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x16_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_8x16_N2(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4192
4193
0
// 16x8 forward transform via svt_av1_fwd_txfm2d_16x8_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x8_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_16x8_N2(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4199
4200
0
// 16x32 forward transform via svt_av1_fwd_txfm2d_16x32_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x32_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x32_N2(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4204
4205
0
// 32x16 forward transform via svt_av1_fwd_txfm2d_32x16_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_32x16_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x16_N2(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4209
4210
0
// 4x16 forward transform, using the tx_type and bd from txfm_param.
// NOTE(review): unlike the other *_n2 helpers, this calls svt_av1_fwd_txfm2d_4x16 rather than
// a kernel with an _N2 suffix - confirm this is intentional.
static void highbd_fwd_txfm_4x16_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x16(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4214
4215
0
// 16x4 forward transform via svt_av1_fwd_txfm2d_16x4_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x4_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x4_N2(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4219
4220
0
// 8x32 forward transform via svt_av1_fwd_txfm2d_8x32_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x32_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x32_N2(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4224
4225
0
// 32x8 forward transform via svt_av1_fwd_txfm2d_32x8_N2, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_32x8_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x8_N2(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4229
4230
0
// 64x64 forward transform via svt_av1_fwd_txfm2d_64x64. The requested type must be
// DCT_DCT (asserted); DCT_DCT is what is handed to the kernel.
static void highbd_fwd_txfm_64x64(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x64(src_diff, (int32_t*)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
4236
4237
0
// 32x64 forward transform via svt_av1_fwd_txfm2d_32x64. DCT_DCT is asserted; the tx_type
// stored in txfm_param is the value forwarded to the kernel.
static void highbd_fwd_txfm_32x64(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_32x64(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4243
4244
0
// 64x32 forward transform via svt_av1_fwd_txfm2d_64x32. DCT_DCT is asserted; the tx_type
// stored in txfm_param is the value forwarded to the kernel.
static void highbd_fwd_txfm_64x32(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x32(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4250
4251
0
// 16x64 forward transform via svt_av1_fwd_txfm2d_16x64. The requested type must be
// DCT_DCT (asserted); DCT_DCT is what is handed to the kernel.
static void highbd_fwd_txfm_16x64(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_16x64(src_diff, (int32_t*)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
4257
4258
0
// 64x16 forward transform via svt_av1_fwd_txfm2d_64x16. The requested type must be
// DCT_DCT (asserted); DCT_DCT is what is handed to the kernel.
static void highbd_fwd_txfm_64x16(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x16(src_diff, (int32_t*)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
4264
4265
0
// 32x32 forward transform via svt_av1_fwd_txfm2d_32x32, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_32x32(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_32x32(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4271
4272
0
// 16x16 forward transform via svt_av1_fwd_txfm2d_16x16, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x16(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_16x16(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4278
4279
0
// 8x8 forward transform via svt_av1_fwd_txfm2d_8x8, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x8(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_8x8(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4285
4286
0
// 4x8 forward transform via svt_av1_fwd_txfm2d_4x8, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_4x8(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x8(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4290
4291
0
// 8x4 forward transform via svt_av1_fwd_txfm2d_8x4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x4(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4295
4296
0
// 8x16 forward transform via svt_av1_fwd_txfm2d_8x16, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x16(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_8x16(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4302
4303
0
// 16x8 forward transform via svt_av1_fwd_txfm2d_16x8, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x8(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    svt_av1_fwd_txfm2d_16x8(src_diff, (int32_t*)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4309
4310
0
// 16x32 forward transform via svt_av1_fwd_txfm2d_16x32, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x32(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x32(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4314
4315
0
// 32x16 forward transform via svt_av1_fwd_txfm2d_32x16, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_32x16(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x16(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4319
4320
0
// 4x16 forward transform via svt_av1_fwd_txfm2d_4x16, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_4x16(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x16(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4324
4325
0
// 16x4 forward transform via svt_av1_fwd_txfm2d_16x4, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_16x4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x4(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4329
4330
0
// 8x32 forward transform via svt_av1_fwd_txfm2d_8x32, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_8x32(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x32(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4334
4335
0
// 32x8 forward transform via svt_av1_fwd_txfm2d_32x8, using the tx_type and bd from txfm_param.
static void highbd_fwd_txfm_32x8(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    const TxType tx_type = txfm_param->tx_type;
    const int    bd      = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x8(src_diff, (int32_t*)coeff, diff_stride, tx_type, bd);
}
4339
4340
0
// Dispatches a forward transform to the highbd_fwd_txfm_*_n4 helper matching txfm_param->tx_size.
// Asserts that tx_type is enabled for tx_set_type in av1_ext_tx_used.
// TX_4X4 is deliberately a no-op (the 4x4 call is disabled as a "hack"), so coeff is left
// untouched for that size. Any other unlisted size asserts.
void svt_av1_highbd_fwd_txfm_n4(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);

    // Helper selected for this size; stays NULL when nothing must be called.
    void (*kernel)(int16_t*, TranLow*, int, TxfmParam*) = NULL;

    switch (txfm_param->tx_size) {
    case TX_64X64: kernel = highbd_fwd_txfm_64x64_n4; break;
    case TX_32X64: kernel = highbd_fwd_txfm_32x64_n4; break;
    case TX_64X32: kernel = highbd_fwd_txfm_64x32_n4; break;
    case TX_16X64: kernel = highbd_fwd_txfm_16x64_n4; break;
    case TX_64X16: kernel = highbd_fwd_txfm_64x16_n4; break;
    case TX_32X32: kernel = highbd_fwd_txfm_32x32_n4; break;
    case TX_16X16: kernel = highbd_fwd_txfm_16x16_n4; break;
    case TX_8X8: kernel = highbd_fwd_txfm_8x8_n4; break;
    case TX_4X8: kernel = highbd_fwd_txfm_4x8_n4; break;
    case TX_8X4: kernel = highbd_fwd_txfm_8x4_n4; break;
    case TX_8X16: kernel = highbd_fwd_txfm_8x16_n4; break;
    case TX_16X8: kernel = highbd_fwd_txfm_16x8_n4; break;
    case TX_16X32: kernel = highbd_fwd_txfm_16x32_n4; break;
    case TX_32X16: kernel = highbd_fwd_txfm_32x16_n4; break;
    case TX_4X4:
        //hack highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_4X16: kernel = highbd_fwd_txfm_4x16_n4; break;
    case TX_16X4: kernel = highbd_fwd_txfm_16x4_n4; break;
    case TX_8X32: kernel = highbd_fwd_txfm_8x32_n4; break;
    case TX_32X8: kernel = highbd_fwd_txfm_32x8_n4; break;
    default: assert(0); break;
    }

    if (kernel != NULL) {
        kernel(src_diff, coeff, diff_stride, txfm_param);
    }
}
4406
4407
0
// Dispatches a forward transform to the highbd_fwd_txfm_*_n2 helper matching txfm_param->tx_size.
// Asserts that tx_type is enabled for tx_set_type in av1_ext_tx_used.
// TX_4X4 is deliberately a no-op (the 4x4 call is disabled as a "hack"), so coeff is left
// untouched for that size. Any other unlisted size asserts.
void svt_av1_highbd_fwd_txfm_n2(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);

    // Helper selected for this size; stays NULL when nothing must be called.
    void (*kernel)(int16_t*, TranLow*, int, TxfmParam*) = NULL;

    switch (txfm_param->tx_size) {
    case TX_64X64: kernel = highbd_fwd_txfm_64x64_n2; break;
    case TX_32X64: kernel = highbd_fwd_txfm_32x64_n2; break;
    case TX_64X32: kernel = highbd_fwd_txfm_64x32_n2; break;
    case TX_16X64: kernel = highbd_fwd_txfm_16x64_n2; break;
    case TX_64X16: kernel = highbd_fwd_txfm_64x16_n2; break;
    case TX_32X32: kernel = highbd_fwd_txfm_32x32_n2; break;
    case TX_16X16: kernel = highbd_fwd_txfm_16x16_n2; break;
    case TX_8X8: kernel = highbd_fwd_txfm_8x8_n2; break;
    case TX_4X8: kernel = highbd_fwd_txfm_4x8_n2; break;
    case TX_8X4: kernel = highbd_fwd_txfm_8x4_n2; break;
    case TX_8X16: kernel = highbd_fwd_txfm_8x16_n2; break;
    case TX_16X8: kernel = highbd_fwd_txfm_16x8_n2; break;
    case TX_16X32: kernel = highbd_fwd_txfm_16x32_n2; break;
    case TX_32X16: kernel = highbd_fwd_txfm_32x16_n2; break;
    case TX_4X4:
        //hack highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_4X16: kernel = highbd_fwd_txfm_4x16_n2; break;
    case TX_16X4: kernel = highbd_fwd_txfm_16x4_n2; break;
    case TX_8X32: kernel = highbd_fwd_txfm_8x32_n2; break;
    case TX_32X8: kernel = highbd_fwd_txfm_32x8_n2; break;
    default: assert(0); break;
    }

    if (kernel != NULL) {
        kernel(src_diff, coeff, diff_stride, txfm_param);
    }
}
4473
4474
0
// Dispatches a forward transform to the highbd_fwd_txfm_* helper matching txfm_param->tx_size.
// Asserts that tx_type is enabled for tx_set_type in av1_ext_tx_used.
// TX_4X4 is deliberately a no-op (the 4x4 call is disabled as a "hack"), so coeff is left
// untouched for that size. Any other unlisted size asserts.
void svt_av1_highbd_fwd_txfm(int16_t* src_diff, TranLow* coeff, int diff_stride, TxfmParam* txfm_param) {
    assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);

    // Helper selected for this size; stays NULL when nothing must be called.
    void (*kernel)(int16_t*, TranLow*, int, TxfmParam*) = NULL;

    switch (txfm_param->tx_size) {
    case TX_64X64: kernel = highbd_fwd_txfm_64x64; break;
    case TX_32X64: kernel = highbd_fwd_txfm_32x64; break;
    case TX_64X32: kernel = highbd_fwd_txfm_64x32; break;
    case TX_16X64: kernel = highbd_fwd_txfm_16x64; break;
    case TX_64X16: kernel = highbd_fwd_txfm_64x16; break;
    case TX_32X32: kernel = highbd_fwd_txfm_32x32; break;
    case TX_16X16: kernel = highbd_fwd_txfm_16x16; break;
    case TX_8X8: kernel = highbd_fwd_txfm_8x8; break;
    case TX_4X8: kernel = highbd_fwd_txfm_4x8; break;
    case TX_8X4: kernel = highbd_fwd_txfm_8x4; break;
    case TX_8X16: kernel = highbd_fwd_txfm_8x16; break;
    case TX_16X8: kernel = highbd_fwd_txfm_16x8; break;
    case TX_16X32: kernel = highbd_fwd_txfm_16x32; break;
    case TX_32X16: kernel = highbd_fwd_txfm_32x16; break;
    case TX_4X4:
        //hack highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_4X16: kernel = highbd_fwd_txfm_4x16; break;
    case TX_16X4: kernel = highbd_fwd_txfm_16x4; break;
    case TX_8X32: kernel = highbd_fwd_txfm_8x32; break;
    case TX_32X8: kernel = highbd_fwd_txfm_32x8; break;
    default: assert(0); break;
    }

    if (kernel != NULL) {
        kernel(src_diff, coeff, diff_stride, txfm_param);
    }
}
4540
4541
// Builds a TxfmParam for one block and runs the forward transform variant
// selected by pf_shape (N4_SHAPE, N2_SHAPE, or anything else for the default
// path). Despite the "wht" in the name, the parameters set here request
// DCT_DCT with lossless = 0 and the EXT_TX_SET_ALL16 set.
//
// src_diff, coeff and bw are forwarded unchanged to the selected transform
// (bw is passed in the third argument slot; presumably the residual stride --
// confirm against the callee). TxfmParam members not assigned below are left
// uninitialized.
void svt_av1_wht_fwd_txfm(int16_t* src_diff, int bw, int32_t* coeff, TxSize tx_size, TxCoeffShape pf_shape,
                          int bit_depth, int is_hbd) {
    TxfmParam txfm_param;

    // Caller-supplied settings.
    txfm_param.bd      = bit_depth;
    txfm_param.is_hbd  = is_hbd;
    txfm_param.tx_size = tx_size;

    // Fixed settings.
    txfm_param.tx_type     = DCT_DCT;
    txfm_param.tx_set_type = EXT_TX_SET_ALL16;
    txfm_param.lossless    = 0;

    if (pf_shape == N4_SHAPE) {
        svt_av1_highbd_fwd_txfm_n4(src_diff, coeff, bw, &txfm_param);
    } else if (pf_shape == N2_SHAPE) {
        svt_av1_highbd_fwd_txfm_n2(src_diff, coeff, bw, &txfm_param);
    } else {
        svt_av1_highbd_fwd_txfm(src_diff, coeff, bw, &txfm_param);
    }
}
4562
4563
0
void svt_av1_fidentity16_N2_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
4564
0
    (void)stage_range;
4565
0
    (void)cos_bit;
4566
0
    for (int32_t i = 0; i < 8; ++i) {
4567
0
        output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits);
4568
0
    }
4569
4570
0
    assert(stage_range[0] + new_sqrt2_bits <= 32);
4571
0
}
4572
4573
0
// Butterfly network for the 16-point forward ADST (per the function name) in
// its N2 form: the last stage finalizes only output[0..7].
//
// Intermediate stages ping-pong between `output` and a local scratch array,
// so `output` must hold 16 entries and must not be the same buffer as
// `input` (asserted). On return output[8..15] still hold stage-7 values.
// stage_range is unused.
void svt_av1_fadst16_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t step[16];

    // stage 1: signed permutation of the input.
    assert(output != input);
    output[0]  = input[0];
    output[1]  = -input[15];
    output[2]  = -input[7];
    output[3]  = input[8];
    output[4]  = -input[3];
    output[5]  = input[12];
    output[6]  = input[4];
    output[7]  = -input[11];
    output[8]  = -input[1];
    output[9]  = input[14];
    output[10] = input[6];
    output[11] = -input[9];
    output[12] = input[2];
    output[13] = -input[13];
    output[14] = -input[5];
    output[15] = input[10];

    // stage 2: within each group of four, rotate the upper pair by cospi[32].
    const int32_t* const cospi = cospi_arr(cos_bit);
    for (int32_t g = 0; g < 16; g += 4) {
        step[g]     = output[g];
        step[g + 1] = output[g + 1];
        step[g + 2] = half_btf(cospi[32], output[g + 2], cospi[32], output[g + 3], cos_bit);
        step[g + 3] = half_btf(cospi[32], output[g + 2], -cospi[32], output[g + 3], cos_bit);
    }

    // stage 3: add/sub at distance 2 inside each group of four.
    for (int32_t g = 0; g < 16; g += 4) {
        output[g]     = step[g] + step[g + 2];
        output[g + 1] = step[g + 1] + step[g + 3];
        output[g + 2] = step[g] - step[g + 2];
        output[g + 3] = step[g + 1] - step[g + 3];
    }

    // stage 4: within each group of eight, rotate the upper four.
    for (int32_t g = 0; g < 16; g += 8) {
        step[g]     = output[g];
        step[g + 1] = output[g + 1];
        step[g + 2] = output[g + 2];
        step[g + 3] = output[g + 3];
        step[g + 4] = half_btf(cospi[16], output[g + 4], cospi[48], output[g + 5], cos_bit);
        step[g + 5] = half_btf(cospi[48], output[g + 4], -cospi[16], output[g + 5], cos_bit);
        step[g + 6] = half_btf(-cospi[48], output[g + 6], cospi[16], output[g + 7], cos_bit);
        step[g + 7] = half_btf(cospi[16], output[g + 6], cospi[48], output[g + 7], cos_bit);
    }

    // stage 5: add/sub at distance 4 inside each group of eight.
    for (int32_t g = 0; g < 16; g += 8) {
        for (int32_t j = 0; j < 4; ++j) {
            output[g + j]     = step[g + j] + step[g + j + 4];
            output[g + j + 4] = step[g + j] - step[g + j + 4];
        }
    }

    // stage 6: lower half passes through, upper half is rotated.
    for (int32_t j = 0; j < 8; ++j) {
        step[j] = output[j];
    }
    step[8]  = half_btf(cospi[8], output[8], cospi[56], output[9], cos_bit);
    step[9]  = half_btf(cospi[56], output[8], -cospi[8], output[9], cos_bit);
    step[10] = half_btf(cospi[40], output[10], cospi[24], output[11], cos_bit);
    step[11] = half_btf(cospi[24], output[10], -cospi[40], output[11], cos_bit);
    step[12] = half_btf(-cospi[56], output[12], cospi[8], output[13], cos_bit);
    step[13] = half_btf(cospi[8], output[12], cospi[56], output[13], cos_bit);
    step[14] = half_btf(-cospi[24], output[14], cospi[40], output[15], cos_bit);
    step[15] = half_btf(cospi[40], output[14], cospi[24], output[15], cos_bit);

    // stage 7: add/sub at distance 8.
    for (int32_t j = 0; j < 8; ++j) {
        output[j]     = step[j] + step[j + 8];
        output[j + 8] = step[j] - step[j + 8];
    }

    // stage 8: only the eight values consumed by stage 9 are computed.
    step[1]  = half_btf(cospi[62], output[0], -cospi[2], output[1], cos_bit);
    step[3]  = half_btf(cospi[54], output[2], -cospi[10], output[3], cos_bit);
    step[5]  = half_btf(cospi[46], output[4], -cospi[18], output[5], cos_bit);
    step[7]  = half_btf(cospi[38], output[6], -cospi[26], output[7], cos_bit);
    step[8]  = half_btf(cospi[34], output[8], cospi[30], output[9], cos_bit);
    step[10] = half_btf(cospi[42], output[10], cospi[22], output[11], cos_bit);
    step[12] = half_btf(cospi[50], output[12], cospi[14], output[13], cos_bit);
    step[14] = half_btf(cospi[58], output[14], cospi[6], output[15], cos_bit);

    // stage 9: reorder into the first eight outputs.
    output[0] = step[1];
    output[1] = step[14];
    output[2] = step[3];
    output[3] = step[12];
    output[4] = step[5];
    output[5] = step[10];
    output[6] = step[7];
    output[7] = step[8];
}
4750
4751
0
// Butterfly network for the 16-point forward DCT (per the function name) in
// its N2 form: the last stage finalizes only output[0..7].
//
// `output` doubles as scratch for all 16 lanes, so it must hold 16 entries;
// on return output[8..15] still hold stage-5 values. stage_range is unused.
void svt_av1_fdct16_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t step[16];

    // stage 1: mirror sums in the lower half, mirror differences in the upper.
    // Lanes are written in ascending order.
    for (int32_t i = 0; i < 8; ++i) {
        output[i] = input[i] + input[15 - i];
    }
    for (int32_t i = 8; i < 16; ++i) {
        output[i] = input[15 - i] - input[i];
    }

    // stage 2
    const int32_t* const cospi = cospi_arr(cos_bit);
    for (int32_t i = 0; i < 4; ++i) {
        step[i] = output[i] + output[7 - i];
    }
    for (int32_t i = 4; i < 8; ++i) {
        step[i] = output[7 - i] - output[i];
    }
    step[8]  = output[8];
    step[9]  = output[9];
    step[10] = half_btf(-cospi[32], output[10], cospi[32], output[13], cos_bit);
    step[11] = half_btf(-cospi[32], output[11], cospi[32], output[12], cos_bit);
    step[12] = half_btf(cospi[32], output[12], cospi[32], output[11], cos_bit);
    step[13] = half_btf(cospi[32], output[13], cospi[32], output[10], cos_bit);
    step[14] = output[14];
    step[15] = output[15];

    // stage 3
    output[0]  = step[0] + step[3];
    output[1]  = step[1] + step[2];
    output[2]  = step[1] - step[2];
    output[3]  = step[0] - step[3];
    output[4]  = step[4];
    output[5]  = half_btf(-cospi[32], step[5], cospi[32], step[6], cos_bit);
    output[6]  = half_btf(cospi[32], step[6], cospi[32], step[5], cos_bit);
    output[7]  = step[7];
    output[8]  = step[8] + step[11];
    output[9]  = step[9] + step[10];
    output[10] = step[9] - step[10];
    output[11] = step[8] - step[11];
    output[12] = step[15] - step[12];
    output[13] = step[14] - step[13];
    output[14] = step[14] + step[13];
    output[15] = step[15] + step[12];

    // stage 4: odd lanes 1 and 3 are not needed by the N2 output and are skipped.
    step[0]  = half_btf(cospi[32], output[0], cospi[32], output[1], cos_bit);
    step[2]  = half_btf(cospi[48], output[2], cospi[16], output[3], cos_bit);
    step[4]  = output[4] + output[5];
    step[5]  = output[4] - output[5];
    step[6]  = output[7] - output[6];
    step[7]  = output[7] + output[6];
    step[8]  = output[8];
    step[9]  = half_btf(-cospi[16], output[9], cospi[48], output[14], cos_bit);
    step[10] = half_btf(-cospi[48], output[10], -cospi[16], output[13], cos_bit);
    step[11] = output[11];
    step[12] = output[12];
    step[13] = half_btf(cospi[48], output[13], -cospi[16], output[10], cos_bit);
    step[14] = half_btf(cospi[16], output[14], cospi[48], output[9], cos_bit);
    step[15] = output[15];

    // stage 5
    output[0]  = step[0];
    output[2]  = step[2];
    output[4]  = half_btf(cospi[56], step[4], cospi[8], step[7], cos_bit);
    output[6]  = half_btf(cospi[24], step[6], -cospi[40], step[5], cos_bit);
    output[8]  = step[8] + step[9];
    output[9]  = step[8] - step[9];
    output[10] = step[11] - step[10];
    output[11] = step[11] + step[10];
    output[12] = step[12] + step[13];
    output[13] = step[12] - step[13];
    output[14] = step[15] - step[14];
    output[15] = step[15] + step[14];

    // stage 6: only even lanes are produced.
    step[0]  = output[0];
    step[2]  = output[2];
    step[4]  = output[4];
    step[6]  = output[6];
    step[8]  = half_btf(cospi[60], output[8], cospi[4], output[15], cos_bit);
    step[10] = half_btf(cospi[44], output[10], cospi[20], output[13], cos_bit);
    step[12] = half_btf(cospi[12], output[12], -cospi[52], output[11], cos_bit);
    step[14] = half_btf(cospi[28], output[14], -cospi[36], output[9], cos_bit);

    // stage 7: reorder into the first eight outputs.
    output[0] = step[0];
    output[1] = step[8];
    output[2] = step[4];
    output[3] = step[12];
    output[4] = step[2];
    output[5] = step[10];
    output[6] = step[6];
    output[7] = step[14];
}
4882
4883
0
// N2 variant of the 8-point forward identity stage: writes only output[0..3],
// each equal to twice the matching input. cos_bit and stage_range are unused.
void svt_av1_fidentity8_N2_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)cos_bit;
    (void)stage_range;

    int32_t k = 0;
    while (k != 4) {
        output[k] = 2 * input[k];
        ++k;
    }
}
4890
4891
0
// Butterfly network for the 8-point forward ADST (per the function name) in
// its N2 form: the last stage finalizes only output[0..3].
//
// `output` is also used as scratch for all 8 lanes and must not be the same
// buffer as `input` (asserted). On return output[4..7] still hold stage-5
// values. stage_range is unused.
void svt_av1_fadst8_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t step[8];

    // stage 1: signed permutation of the input.
    assert(output != input);
    output[0] = input[0];
    output[1] = -input[7];
    output[2] = -input[3];
    output[3] = input[4];
    output[4] = -input[1];
    output[5] = input[6];
    output[6] = input[2];
    output[7] = -input[5];

    // stage 2: within each group of four, rotate the upper pair by cospi[32].
    const int32_t* const cospi = cospi_arr(cos_bit);
    for (int32_t g = 0; g < 8; g += 4) {
        step[g]     = output[g];
        step[g + 1] = output[g + 1];
        step[g + 2] = half_btf(cospi[32], output[g + 2], cospi[32], output[g + 3], cos_bit);
        step[g + 3] = half_btf(cospi[32], output[g + 2], -cospi[32], output[g + 3], cos_bit);
    }

    // stage 3: add/sub at distance 2 inside each group of four.
    for (int32_t g = 0; g < 8; g += 4) {
        output[g]     = step[g] + step[g + 2];
        output[g + 1] = step[g + 1] + step[g + 3];
        output[g + 2] = step[g] - step[g + 2];
        output[g + 3] = step[g + 1] - step[g + 3];
    }

    // stage 4: lower four pass through, upper four are rotated.
    for (int32_t j = 0; j < 4; ++j) {
        step[j] = output[j];
    }
    step[4] = half_btf(cospi[16], output[4], cospi[48], output[5], cos_bit);
    step[5] = half_btf(cospi[48], output[4], -cospi[16], output[5], cos_bit);
    step[6] = half_btf(-cospi[48], output[6], cospi[16], output[7], cos_bit);
    step[7] = half_btf(cospi[16], output[6], cospi[48], output[7], cos_bit);

    // stage 5: add/sub at distance 4.
    for (int32_t j = 0; j < 4; ++j) {
        output[j]     = step[j] + step[j + 4];
        output[j + 4] = step[j] - step[j + 4];
    }

    // stage 6: only the four values consumed by stage 7 are computed.
    step[1] = half_btf(cospi[60], output[0], -cospi[4], output[1], cos_bit);
    step[3] = half_btf(cospi[44], output[2], -cospi[20], output[3], cos_bit);
    step[4] = half_btf(cospi[36], output[4], cospi[28], output[5], cos_bit);
    step[6] = half_btf(cospi[52], output[6], cospi[12], output[7], cos_bit);

    // stage 7: reorder into the first four outputs.
    output[0] = step[1];
    output[1] = step[6];
    output[2] = step[3];
    output[3] = step[4];
}
4979
4980
0
// Butterfly network for the 8-point forward DCT (per the function name) in
// its N2 form: the last stage finalizes only output[0..3].
//
// `output` is also used as scratch for all 8 lanes; on return output[4..7]
// still hold stage-3 values. stage_range is unused.
void svt_av1_fdct8_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t step[8];

    // stage 1: mirror sums in the lower half, mirror differences in the upper.
    // Lanes are written in ascending order.
    for (int32_t i = 0; i < 4; ++i) {
        output[i] = input[i] + input[7 - i];
    }
    for (int32_t i = 4; i < 8; ++i) {
        output[i] = input[7 - i] - input[i];
    }

    // stage 2
    const int32_t* const cospi = cospi_arr(cos_bit);
    step[0] = output[0] + output[3];
    step[1] = output[1] + output[2];
    step[2] = output[1] - output[2];
    step[3] = output[0] - output[3];
    step[4] = output[4];
    step[5] = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    step[6] = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    step[7] = output[7];

    // stage 3: odd lanes 1 and 3 are not needed by the N2 output and are skipped.
    output[0] = half_btf(cospi[32], step[0], cospi[32], step[1], cos_bit);
    output[2] = half_btf(cospi[48], step[2], cospi[16], step[3], cos_bit);
    output[4] = step[4] + step[5];
    output[5] = step[4] - step[5];
    output[6] = step[7] - step[6];
    output[7] = step[7] + step[6];

    // stage 4: only even lanes are produced.
    step[0] = output[0];
    step[2] = output[2];
    step[4] = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);
    step[6] = half_btf(cospi[24], output[6], -cospi[40], output[5], cos_bit);

    // stage 5: reorder into the first four outputs.
    output[0] = step[0];
    output[1] = step[4];
    output[2] = step[2];
    output[3] = step[6];
}
5041
5042
0
void svt_av1_fidentity4_N2_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
5043
0
    (void)stage_range;
5044
0
    (void)cos_bit;
5045
0
    output[0] = round_shift((int64_t)input[0] * new_sqrt2, new_sqrt2_bits);
5046
0
    output[1] = round_shift((int64_t)input[1] * new_sqrt2, new_sqrt2_bits);
5047
0
    assert(stage_range[0] + new_sqrt2_bits <= 32);
5048
0
}
5049
5050
0
// 4-point forward ADST (per the function name) in its N2 form: for non-zero
// input only output[0] and output[1] are written. When all four inputs are
// zero, output[0..3] are all cleared and the function returns early.
// stage_range is unused.
void svt_av1_fadst4_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t        bit   = cos_bit;
    const int32_t* sinpi = sinpi_arr(bit);

    const int32_t in0 = input[0];
    const int32_t in1 = input[1];
    const int32_t in2 = input[2];
    const int32_t in3 = input[3];

    // All-zero input short-circuits to an all-zero (4-entry) result.
    if ((in0 | in1 | in2 | in3) == 0) {
        for (int32_t k = 0; k < 4; ++k) {
            output[k] = 0;
        }
        return;
    }

    // First coefficient: the products are accumulated left to right in 32-bit
    // arithmetic, in the order sinpi[1], sinpi[2], sinpi[4], sinpi[3].
    const int32_t acc0 = sinpi[1] * in0 + sinpi[2] * in1 + sinpi[4] * in3 + sinpi[3] * in2;

    // Second coefficient: sinpi[3] * (in0 + in1 - in3).
    const int32_t acc1 = sinpi[3] * (in0 + in1 - in3);

    // 1-D transform scaling factor is sqrt(2).
    output[0] = round_shift(acc0, bit);
    output[1] = round_shift(acc1, bit);
}
5092
5093
0
// 4-point forward DCT (per the function name) in its N2 form: only output[0]
// and output[1] are written. All four inputs are read before any output is
// stored. stage_range is unused.
void svt_av1_fdct4_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;

    // stage 1: mirror sums and differences.
    const int32_t sum03  = input[0] + input[3];
    const int32_t sum12  = input[1] + input[2];
    const int32_t diff12 = input[1] - input[2];
    const int32_t diff03 = input[0] - input[3];

    // stage 2: rotate each pair into one output coefficient.
    const int32_t* const cospi = cospi_arr(cos_bit);

    output[0] = half_btf(cospi[32], sum03, cospi[32], sum12, cos_bit);
    output[1] = half_btf(cospi[48], diff12, cospi[16], diff03, cos_bit);
}
5113
5114
0
// Butterfly network for the 32-point forward DCT (per the function name) in
// its N2 form: the last stage finalizes only output[0..15].
//
// `output` is also used as scratch for all 32 lanes, so it must hold 32
// entries; on return output[16..31] still hold stage-7 values.
// stage_range is unused.
void svt_av1_fdct32_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t step[32];

    // stage 1: mirror sums in the lower half, mirror differences in the upper.
    // Lanes are written in ascending order.
    for (int32_t i = 0; i < 16; ++i) {
        output[i] = input[i] + input[31 - i];
    }
    for (int32_t i = 16; i < 32; ++i) {
        output[i] = input[31 - i] - input[i];
    }

    // stage 2
    const int32_t* const cospi = cospi_arr(cos_bit);
    for (int32_t i = 0; i < 8; ++i) {
        step[i] = output[i] + output[15 - i];
    }
    for (int32_t i = 8; i < 16; ++i) {
        step[i] = output[15 - i] - output[i];
    }
    for (int32_t i = 16; i < 20; ++i) {
        step[i] = output[i];
    }
    for (int32_t i = 20; i < 24; ++i) {
        step[i] = half_btf(-cospi[32], output[i], cospi[32], output[47 - i], cos_bit);
    }
    for (int32_t i = 24; i < 28; ++i) {
        step[i] = half_btf(cospi[32], output[i], cospi[32], output[47 - i], cos_bit);
    }
    for (int32_t i = 28; i < 32; ++i) {
        step[i] = output[i];
    }

    // stage 3
    for (int32_t i = 0; i < 4; ++i) {
        output[i] = step[i] + step[7 - i];
    }
    for (int32_t i = 4; i < 8; ++i) {
        output[i] = step[7 - i] - step[i];
    }
    output[8]  = step[8];
    output[9]  = step[9];
    output[10] = half_btf(-cospi[32], step[10], cospi[32], step[13], cos_bit);
    output[11] = half_btf(-cospi[32], step[11], cospi[32], step[12], cos_bit);
    output[12] = half_btf(cospi[32], step[12], cospi[32], step[11], cos_bit);
    output[13] = half_btf(cospi[32], step[13], cospi[32], step[10], cos_bit);
    output[14] = step[14];
    output[15] = step[15];
    for (int32_t i = 16; i < 20; ++i) {
        output[i] = step[i] + step[39 - i];
    }
    for (int32_t i = 20; i < 24; ++i) {
        output[i] = step[39 - i] - step[i];
    }
    for (int32_t i = 24; i < 28; ++i) {
        output[i] = step[55 - i] - step[i];
    }
    for (int32_t i = 28; i < 32; ++i) {
        output[i] = step[i] + step[55 - i];
    }

    // stage 4
    step[0]  = output[0] + output[3];
    step[1]  = output[1] + output[2];
    step[2]  = output[1] - output[2];
    step[3]  = output[0] - output[3];
    step[4]  = output[4];
    step[5]  = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    step[6]  = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    step[7]  = output[7];
    step[8]  = output[8] + output[11];
    step[9]  = output[9] + output[10];
    step[10] = output[9] - output[10];
    step[11] = output[8] - output[11];
    step[12] = output[15] - output[12];
    step[13] = output[14] - output[13];
    step[14] = output[14] + output[13];
    step[15] = output[15] + output[12];
    step[16] = output[16];
    step[17] = output[17];
    step[18] = half_btf(-cospi[16], output[18], cospi[48], output[29], cos_bit);
    step[19] = half_btf(-cospi[16], output[19], cospi[48], output[28], cos_bit);
    step[20] = half_btf(-cospi[48], output[20], -cospi[16], output[27], cos_bit);
    step[21] = half_btf(-cospi[48], output[21], -cospi[16], output[26], cos_bit);
    step[22] = output[22];
    step[23] = output[23];
    step[24] = output[24];
    step[25] = output[25];
    step[26] = half_btf(cospi[48], output[26], -cospi[16], output[21], cos_bit);
    step[27] = half_btf(cospi[48], output[27], -cospi[16], output[20], cos_bit);
    step[28] = half_btf(cospi[16], output[28], cospi[48], output[19], cos_bit);
    step[29] = half_btf(cospi[16], output[29], cospi[48], output[18], cos_bit);
    step[30] = output[30];
    step[31] = output[31];

    // stage 5: odd lanes 1 and 3 are not needed by the N2 output and are skipped.
    output[0]  = half_btf(cospi[32], step[0], cospi[32], step[1], cos_bit);
    output[2]  = half_btf(cospi[48], step[2], cospi[16], step[3], cos_bit);
    output[4]  = step[4] + step[5];
    output[5]  = step[4] - step[5];
    output[6]  = step[7] - step[6];
    output[7]  = step[7] + step[6];
    output[8]  = step[8];
    output[9]  = half_btf(-cospi[16], step[9], cospi[48], step[14], cos_bit);
    output[10] = half_btf(-cospi[48], step[10], -cospi[16], step[13], cos_bit);
    output[11] = step[11];
    output[12] = step[12];
    output[13] = half_btf(cospi[48], step[13], -cospi[16], step[10], cos_bit);
    output[14] = half_btf(cospi[16], step[14], cospi[48], step[9], cos_bit);
    output[15] = step[15];
    for (int32_t g = 16; g < 32; g += 8) {
        output[g]     = step[g] + step[g + 3];
        output[g + 1] = step[g + 1] + step[g + 2];
        output[g + 2] = step[g + 1] - step[g + 2];
        output[g + 3] = step[g] - step[g + 3];
        output[g + 4] = step[g + 7] - step[g + 4];
        output[g + 5] = step[g + 6] - step[g + 5];
        output[g + 6] = step[g + 6] + step[g + 5];
        output[g + 7] = step[g + 7] + step[g + 4];
    }

    // stage 6
    step[0] = output[0];
    step[2] = output[2];
    step[4] = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);
    step[6] = half_btf(cospi[24], output[6], -cospi[40], output[5], cos_bit);
    for (int32_t g = 8; g < 16; g += 4) {
        step[g]     = output[g] + output[g + 1];
        step[g + 1] = output[g] - output[g + 1];
        step[g + 2] = output[g + 3] - output[g + 2];
        step[g + 3] = output[g + 3] + output[g + 2];
    }
    step[16] = output[16];
    step[17] = half_btf(-cospi[8], output[17], cospi[56], output[30], cos_bit);
    step[18] = half_btf(-cospi[56], output[18], -cospi[8], output[29], cos_bit);
    step[19] = output[19];
    step[20] = output[20];
    step[21] = half_btf(-cospi[40], output[21], cospi[24], output[26], cos_bit);
    step[22] = half_btf(-cospi[24], output[22], -cospi[40], output[25], cos_bit);
    step[23] = output[23];
    step[24] = output[24];
    step[25] = half_btf(cospi[24], output[25], -cospi[40], output[22], cos_bit);
    step[26] = half_btf(cospi[40], output[26], cospi[24], output[21], cos_bit);
    step[27] = output[27];
    step[28] = output[28];
    step[29] = half_btf(cospi[56], output[29], -cospi[8], output[18], cos_bit);
    step[30] = half_btf(cospi[8], output[30], cospi[56], output[17], cos_bit);
    step[31] = output[31];

    // stage 7
    output[0]  = step[0];
    output[2]  = step[2];
    output[4]  = step[4];
    output[6]  = step[6];
    output[8]  = half_btf(cospi[60], step[8], cospi[4], step[15], cos_bit);
    output[10] = half_btf(cospi[44], step[10], cospi[20], step[13], cos_bit);
    output[12] = half_btf(cospi[12], step[12], -cospi[52], step[11], cos_bit);
    output[14] = half_btf(cospi[28], step[14], -cospi[36], step[9], cos_bit);
    for (int32_t g = 16; g < 32; g += 4) {
        output[g]     = step[g] + step[g + 1];
        output[g + 1] = step[g] - step[g + 1];
        output[g + 2] = step[g + 3] - step[g + 2];
        output[g + 3] = step[g + 3] + step[g + 2];
    }

    // stage 8: only even lanes are produced.
    for (int32_t i = 0; i < 16; i += 2) {
        step[i] = output[i];
    }
    step[16] = half_btf(cospi[62], output[16], cospi[2], output[31], cos_bit);
    step[18] = half_btf(cospi[46], output[18], cospi[18], output[29], cos_bit);
    step[20] = half_btf(cospi[54], output[20], cospi[10], output[27], cos_bit);
    step[22] = half_btf(cospi[38], output[22], cospi[26], output[25], cos_bit);
    step[24] = half_btf(cospi[6], output[24], -cospi[58], output[23], cos_bit);
    step[26] = half_btf(cospi[22], output[26], -cospi[42], output[21], cos_bit);
    step[28] = half_btf(cospi[14], output[28], -cospi[50], output[19], cos_bit);
    step[30] = half_btf(cospi[30], output[30], -cospi[34], output[17], cos_bit);

    // stage 9: reorder into the first sixteen outputs.
    output[0]  = step[0];
    output[1]  = step[16];
    output[2]  = step[8];
    output[3]  = step[24];
    output[4]  = step[4];
    output[5]  = step[20];
    output[6]  = step[12];
    output[7]  = step[28];
    output[8]  = step[2];
    output[9]  = step[18];
    output[10] = step[10];
    output[11] = step[26];
    output[12] = step[6];
    output[13] = step[22];
    output[14] = step[14];
    output[15] = step[30];
}
5407
5408
0
// Half-length variant of the 32-point forward identity transform.
//
// Multiplies each of the first 16 input samples by 4 and stores the result
// at the same position in output. Entries 16 and above of output are not
// written by this function.
//
// input       : source samples; input[0..15] are read.
// output      : destination buffer; output[0..15] are written.
// cos_bit     : unused (kept so the function matches the common kernel signature).
// stage_range : unused.
void svt_av1_fidentity32_N2_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)cos_bit;
    (void)stage_range;

    const int32_t* src       = input;
    int32_t*       dst       = output;
    int32_t        remaining = 16;

    // Walk both buffers in lock-step, lowest index first.
    while (remaining > 0) {
        *dst = *src * 4;
        ++dst;
        ++src;
        --remaining;
    }
}
5415
5416
0
// 64-point forward DCT butterfly network that produces only 32 output
// coefficients (presumably "N2" = keep N/2 coefficients -- confirm against
// the callers).
//
// input       : source samples; all of input[0..63] are read in stage 1.
// output      : must hold at least 64 int32_t. The stages alternate between
//               this buffer and the local step[] array as source/destination,
//               so all 64 entries are written by the intermediate stages.
//               Only output[0..31] are assigned by the final stage (stage 11);
//               output[32..63] are left holding stage-9 intermediates.
// cos_bit     : passed to cospi_arr() to obtain the cospi[] constants and to
//               every half_btf() call.
// stage_range : unused.
void svt_av1_fdct64_new_N2(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
5417
0
    (void)stage_range;
5418
0
    const int32_t* cospi;
5419
5420
0
    // bf0 is the source buffer of the current stage, bf1 its destination.
    int32_t *bf0, *bf1;
5421
0
    int32_t  step[64];
5422
5423
    // stage 0;
5424
5425
    // stage 1;
    // Mirror butterfly over the whole input: sums go to entries 0..31,
    // differences to entries 32..63.
5426
0
    bf1     = output;
5427
0
    bf1[0]  = input[0] + input[63];
5428
0
    bf1[1]  = input[1] + input[62];
5429
0
    bf1[2]  = input[2] + input[61];
5430
0
    bf1[3]  = input[3] + input[60];
5431
0
    bf1[4]  = input[4] + input[59];
5432
0
    bf1[5]  = input[5] + input[58];
5433
0
    bf1[6]  = input[6] + input[57];
5434
0
    bf1[7]  = input[7] + input[56];
5435
0
    bf1[8]  = input[8] + input[55];
5436
0
    bf1[9]  = input[9] + input[54];
5437
0
    bf1[10] = input[10] + input[53];
5438
0
    bf1[11] = input[11] + input[52];
5439
0
    bf1[12] = input[12] + input[51];
5440
0
    bf1[13] = input[13] + input[50];
5441
0
    bf1[14] = input[14] + input[49];
5442
0
    bf1[15] = input[15] + input[48];
5443
0
    bf1[16] = input[16] + input[47];
5444
0
    bf1[17] = input[17] + input[46];
5445
0
    bf1[18] = input[18] + input[45];
5446
0
    bf1[19] = input[19] + input[44];
5447
0
    bf1[20] = input[20] + input[43];
5448
0
    bf1[21] = input[21] + input[42];
5449
0
    bf1[22] = input[22] + input[41];
5450
0
    bf1[23] = input[23] + input[40];
5451
0
    bf1[24] = input[24] + input[39];
5452
0
    bf1[25] = input[25] + input[38];
5453
0
    bf1[26] = input[26] + input[37];
5454
0
    bf1[27] = input[27] + input[36];
5455
0
    bf1[28] = input[28] + input[35];
5456
0
    bf1[29] = input[29] + input[34];
5457
0
    bf1[30] = input[30] + input[33];
5458
0
    bf1[31] = input[31] + input[32];
5459
0
    bf1[32] = -input[32] + input[31];
5460
0
    bf1[33] = -input[33] + input[30];
5461
0
    bf1[34] = -input[34] + input[29];
5462
0
    bf1[35] = -input[35] + input[28];
5463
0
    bf1[36] = -input[36] + input[27];
5464
0
    bf1[37] = -input[37] + input[26];
5465
0
    bf1[38] = -input[38] + input[25];
5466
0
    bf1[39] = -input[39] + input[24];
5467
0
    bf1[40] = -input[40] + input[23];
5468
0
    bf1[41] = -input[41] + input[22];
5469
0
    bf1[42] = -input[42] + input[21];
5470
0
    bf1[43] = -input[43] + input[20];
5471
0
    bf1[44] = -input[44] + input[19];
5472
0
    bf1[45] = -input[45] + input[18];
5473
0
    bf1[46] = -input[46] + input[17];
5474
0
    bf1[47] = -input[47] + input[16];
5475
0
    bf1[48] = -input[48] + input[15];
5476
0
    bf1[49] = -input[49] + input[14];
5477
0
    bf1[50] = -input[50] + input[13];
5478
0
    bf1[51] = -input[51] + input[12];
5479
0
    bf1[52] = -input[52] + input[11];
5480
0
    bf1[53] = -input[53] + input[10];
5481
0
    bf1[54] = -input[54] + input[9];
5482
0
    bf1[55] = -input[55] + input[8];
5483
0
    bf1[56] = -input[56] + input[7];
5484
0
    bf1[57] = -input[57] + input[6];
5485
0
    bf1[58] = -input[58] + input[5];
5486
0
    bf1[59] = -input[59] + input[4];
5487
0
    bf1[60] = -input[60] + input[3];
5488
0
    bf1[61] = -input[61] + input[2];
5489
0
    bf1[62] = -input[62] + input[1];
5490
0
    bf1[63] = -input[63] + input[0];
5491
5492
    // stage 2
5493
0
    cospi   = cospi_arr(cos_bit);
5494
0
    bf0     = output;
5495
0
    bf1     = step;
5496
0
    bf1[0]  = bf0[0] + bf0[31];
5497
0
    bf1[1]  = bf0[1] + bf0[30];
5498
0
    bf1[2]  = bf0[2] + bf0[29];
5499
0
    bf1[3]  = bf0[3] + bf0[28];
5500
0
    bf1[4]  = bf0[4] + bf0[27];
5501
0
    bf1[5]  = bf0[5] + bf0[26];
5502
0
    bf1[6]  = bf0[6] + bf0[25];
5503
0
    bf1[7]  = bf0[7] + bf0[24];
5504
0
    bf1[8]  = bf0[8] + bf0[23];
5505
0
    bf1[9]  = bf0[9] + bf0[22];
5506
0
    bf1[10] = bf0[10] + bf0[21];
5507
0
    bf1[11] = bf0[11] + bf0[20];
5508
0
    bf1[12] = bf0[12] + bf0[19];
5509
0
    bf1[13] = bf0[13] + bf0[18];
5510
0
    bf1[14] = bf0[14] + bf0[17];
5511
0
    bf1[15] = bf0[15] + bf0[16];
5512
0
    bf1[16] = -bf0[16] + bf0[15];
5513
0
    bf1[17] = -bf0[17] + bf0[14];
5514
0
    bf1[18] = -bf0[18] + bf0[13];
5515
0
    bf1[19] = -bf0[19] + bf0[12];
5516
0
    bf1[20] = -bf0[20] + bf0[11];
5517
0
    bf1[21] = -bf0[21] + bf0[10];
5518
0
    bf1[22] = -bf0[22] + bf0[9];
5519
0
    bf1[23] = -bf0[23] + bf0[8];
5520
0
    bf1[24] = -bf0[24] + bf0[7];
5521
0
    bf1[25] = -bf0[25] + bf0[6];
5522
0
    bf1[26] = -bf0[26] + bf0[5];
5523
0
    bf1[27] = -bf0[27] + bf0[4];
5524
0
    bf1[28] = -bf0[28] + bf0[3];
5525
0
    bf1[29] = -bf0[29] + bf0[2];
5526
0
    bf1[30] = -bf0[30] + bf0[1];
5527
0
    bf1[31] = -bf0[31] + bf0[0];
5528
0
    bf1[32] = bf0[32];
5529
0
    bf1[33] = bf0[33];
5530
0
    bf1[34] = bf0[34];
5531
0
    bf1[35] = bf0[35];
5532
0
    bf1[36] = bf0[36];
5533
0
    bf1[37] = bf0[37];
5534
0
    bf1[38] = bf0[38];
5535
0
    bf1[39] = bf0[39];
5536
0
    bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
5537
0
    bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
5538
0
    bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
5539
0
    bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
5540
0
    bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
5541
0
    bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
5542
0
    bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
5543
0
    bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
5544
0
    bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
5545
0
    bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
5546
0
    bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
5547
0
    bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
5548
0
    bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
5549
0
    bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
5550
0
    bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
5551
0
    bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
5552
0
    bf1[56] = bf0[56];
5553
0
    bf1[57] = bf0[57];
5554
0
    bf1[58] = bf0[58];
5555
0
    bf1[59] = bf0[59];
5556
0
    bf1[60] = bf0[60];
5557
0
    bf1[61] = bf0[61];
5558
0
    bf1[62] = bf0[62];
5559
0
    bf1[63] = bf0[63];
5560
5561
    // stage 3
5562
0
    cospi   = cospi_arr(cos_bit);
5563
0
    bf0     = step;
5564
0
    bf1     = output;
5565
0
    bf1[0]  = bf0[0] + bf0[15];
5566
0
    bf1[1]  = bf0[1] + bf0[14];
5567
0
    bf1[2]  = bf0[2] + bf0[13];
5568
0
    bf1[3]  = bf0[3] + bf0[12];
5569
0
    bf1[4]  = bf0[4] + bf0[11];
5570
0
    bf1[5]  = bf0[5] + bf0[10];
5571
0
    bf1[6]  = bf0[6] + bf0[9];
5572
0
    bf1[7]  = bf0[7] + bf0[8];
5573
0
    bf1[8]  = -bf0[8] + bf0[7];
5574
0
    bf1[9]  = -bf0[9] + bf0[6];
5575
0
    bf1[10] = -bf0[10] + bf0[5];
5576
0
    bf1[11] = -bf0[11] + bf0[4];
5577
0
    bf1[12] = -bf0[12] + bf0[3];
5578
0
    bf1[13] = -bf0[13] + bf0[2];
5579
0
    bf1[14] = -bf0[14] + bf0[1];
5580
0
    bf1[15] = -bf0[15] + bf0[0];
5581
0
    bf1[16] = bf0[16];
5582
0
    bf1[17] = bf0[17];
5583
0
    bf1[18] = bf0[18];
5584
0
    bf1[19] = bf0[19];
5585
0
    bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
5586
0
    bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
5587
0
    bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
5588
0
    bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
5589
0
    bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
5590
0
    bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
5591
0
    bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
5592
0
    bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
5593
0
    bf1[28] = bf0[28];
5594
0
    bf1[29] = bf0[29];
5595
0
    bf1[30] = bf0[30];
5596
0
    bf1[31] = bf0[31];
5597
0
    bf1[32] = bf0[32] + bf0[47];
5598
0
    bf1[33] = bf0[33] + bf0[46];
5599
0
    bf1[34] = bf0[34] + bf0[45];
5600
0
    bf1[35] = bf0[35] + bf0[44];
5601
0
    bf1[36] = bf0[36] + bf0[43];
5602
0
    bf1[37] = bf0[37] + bf0[42];
5603
0
    bf1[38] = bf0[38] + bf0[41];
5604
0
    bf1[39] = bf0[39] + bf0[40];
5605
0
    bf1[40] = -bf0[40] + bf0[39];
5606
0
    bf1[41] = -bf0[41] + bf0[38];
5607
0
    bf1[42] = -bf0[42] + bf0[37];
5608
0
    bf1[43] = -bf0[43] + bf0[36];
5609
0
    bf1[44] = -bf0[44] + bf0[35];
5610
0
    bf1[45] = -bf0[45] + bf0[34];
5611
0
    bf1[46] = -bf0[46] + bf0[33];
5612
0
    bf1[47] = -bf0[47] + bf0[32];
5613
0
    bf1[48] = -bf0[48] + bf0[63];
5614
0
    bf1[49] = -bf0[49] + bf0[62];
5615
0
    bf1[50] = -bf0[50] + bf0[61];
5616
0
    bf1[51] = -bf0[51] + bf0[60];
5617
0
    bf1[52] = -bf0[52] + bf0[59];
5618
0
    bf1[53] = -bf0[53] + bf0[58];
5619
0
    bf1[54] = -bf0[54] + bf0[57];
5620
0
    bf1[55] = -bf0[55] + bf0[56];
5621
0
    bf1[56] = bf0[56] + bf0[55];
5622
0
    bf1[57] = bf0[57] + bf0[54];
5623
0
    bf1[58] = bf0[58] + bf0[53];
5624
0
    bf1[59] = bf0[59] + bf0[52];
5625
0
    bf1[60] = bf0[60] + bf0[51];
5626
0
    bf1[61] = bf0[61] + bf0[50];
5627
0
    bf1[62] = bf0[62] + bf0[49];
5628
0
    bf1[63] = bf0[63] + bf0[48];
5629
5630
    // stage 4
5631
0
    cospi   = cospi_arr(cos_bit);
5632
0
    bf0     = output;
5633
0
    bf1     = step;
5634
0
    bf1[0]  = bf0[0] + bf0[7];
5635
0
    bf1[1]  = bf0[1] + bf0[6];
5636
0
    bf1[2]  = bf0[2] + bf0[5];
5637
0
    bf1[3]  = bf0[3] + bf0[4];
5638
0
    bf1[4]  = -bf0[4] + bf0[3];
5639
0
    bf1[5]  = -bf0[5] + bf0[2];
5640
0
    bf1[6]  = -bf0[6] + bf0[1];
5641
0
    bf1[7]  = -bf0[7] + bf0[0];
5642
0
    bf1[8]  = bf0[8];
5643
0
    bf1[9]  = bf0[9];
5644
0
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
5645
0
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
5646
0
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
5647
0
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
5648
0
    bf1[14] = bf0[14];
5649
0
    bf1[15] = bf0[15];
5650
0
    bf1[16] = bf0[16] + bf0[23];
5651
0
    bf1[17] = bf0[17] + bf0[22];
5652
0
    bf1[18] = bf0[18] + bf0[21];
5653
0
    bf1[19] = bf0[19] + bf0[20];
5654
0
    bf1[20] = -bf0[20] + bf0[19];
5655
0
    bf1[21] = -bf0[21] + bf0[18];
5656
0
    bf1[22] = -bf0[22] + bf0[17];
5657
0
    bf1[23] = -bf0[23] + bf0[16];
5658
0
    bf1[24] = -bf0[24] + bf0[31];
5659
0
    bf1[25] = -bf0[25] + bf0[30];
5660
0
    bf1[26] = -bf0[26] + bf0[29];
5661
0
    bf1[27] = -bf0[27] + bf0[28];
5662
0
    bf1[28] = bf0[28] + bf0[27];
5663
0
    bf1[29] = bf0[29] + bf0[26];
5664
0
    bf1[30] = bf0[30] + bf0[25];
5665
0
    bf1[31] = bf0[31] + bf0[24];
5666
0
    bf1[32] = bf0[32];
5667
0
    bf1[33] = bf0[33];
5668
0
    bf1[34] = bf0[34];
5669
0
    bf1[35] = bf0[35];
5670
0
    bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
5671
0
    bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
5672
0
    bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
5673
0
    bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
5674
0
    bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
5675
0
    bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
5676
0
    bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
5677
0
    bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
5678
0
    bf1[44] = bf0[44];
5679
0
    bf1[45] = bf0[45];
5680
0
    bf1[46] = bf0[46];
5681
0
    bf1[47] = bf0[47];
5682
0
    bf1[48] = bf0[48];
5683
0
    bf1[49] = bf0[49];
5684
0
    bf1[50] = bf0[50];
5685
0
    bf1[51] = bf0[51];
5686
0
    bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
5687
0
    bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
5688
0
    bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
5689
0
    bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
5690
0
    bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
5691
0
    bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
5692
0
    bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
5693
0
    bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
5694
0
    bf1[60] = bf0[60];
5695
0
    bf1[61] = bf0[61];
5696
0
    bf1[62] = bf0[62];
5697
0
    bf1[63] = bf0[63];
5698
5699
    // stage 5
5700
0
    cospi   = cospi_arr(cos_bit);
5701
0
    bf0     = step;
5702
0
    bf1     = output;
5703
0
    bf1[0]  = bf0[0] + bf0[3];
5704
0
    bf1[1]  = bf0[1] + bf0[2];
5705
0
    bf1[2]  = -bf0[2] + bf0[1];
5706
0
    bf1[3]  = -bf0[3] + bf0[0];
5707
0
    bf1[4]  = bf0[4];
5708
0
    bf1[5]  = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
5709
0
    bf1[6]  = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
5710
0
    bf1[7]  = bf0[7];
5711
0
    bf1[8]  = bf0[8] + bf0[11];
5712
0
    bf1[9]  = bf0[9] + bf0[10];
5713
0
    bf1[10] = -bf0[10] + bf0[9];
5714
0
    bf1[11] = -bf0[11] + bf0[8];
5715
0
    bf1[12] = -bf0[12] + bf0[15];
5716
0
    bf1[13] = -bf0[13] + bf0[14];
5717
0
    bf1[14] = bf0[14] + bf0[13];
5718
0
    bf1[15] = bf0[15] + bf0[12];
5719
0
    bf1[16] = bf0[16];
5720
0
    bf1[17] = bf0[17];
5721
0
    bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
5722
0
    bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
5723
0
    bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
5724
0
    bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
5725
0
    bf1[22] = bf0[22];
5726
0
    bf1[23] = bf0[23];
5727
0
    bf1[24] = bf0[24];
5728
0
    bf1[25] = bf0[25];
5729
0
    bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
5730
0
    bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
5731
0
    bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
5732
0
    bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
5733
0
    bf1[30] = bf0[30];
5734
0
    bf1[31] = bf0[31];
5735
0
    bf1[32] = bf0[32] + bf0[39];
5736
0
    bf1[33] = bf0[33] + bf0[38];
5737
0
    bf1[34] = bf0[34] + bf0[37];
5738
0
    bf1[35] = bf0[35] + bf0[36];
5739
0
    bf1[36] = -bf0[36] + bf0[35];
5740
0
    bf1[37] = -bf0[37] + bf0[34];
5741
0
    bf1[38] = -bf0[38] + bf0[33];
5742
0
    bf1[39] = -bf0[39] + bf0[32];
5743
0
    bf1[40] = -bf0[40] + bf0[47];
5744
0
    bf1[41] = -bf0[41] + bf0[46];
5745
0
    bf1[42] = -bf0[42] + bf0[45];
5746
0
    bf1[43] = -bf0[43] + bf0[44];
5747
0
    bf1[44] = bf0[44] + bf0[43];
5748
0
    bf1[45] = bf0[45] + bf0[42];
5749
0
    bf1[46] = bf0[46] + bf0[41];
5750
0
    bf1[47] = bf0[47] + bf0[40];
5751
0
    bf1[48] = bf0[48] + bf0[55];
5752
0
    bf1[49] = bf0[49] + bf0[54];
5753
0
    bf1[50] = bf0[50] + bf0[53];
5754
0
    bf1[51] = bf0[51] + bf0[52];
5755
0
    bf1[52] = -bf0[52] + bf0[51];
5756
0
    bf1[53] = -bf0[53] + bf0[50];
5757
0
    bf1[54] = -bf0[54] + bf0[49];
5758
0
    bf1[55] = -bf0[55] + bf0[48];
5759
0
    bf1[56] = -bf0[56] + bf0[63];
5760
0
    bf1[57] = -bf0[57] + bf0[62];
5761
0
    bf1[58] = -bf0[58] + bf0[61];
5762
0
    bf1[59] = -bf0[59] + bf0[60];
5763
0
    bf1[60] = bf0[60] + bf0[59];
5764
0
    bf1[61] = bf0[61] + bf0[58];
5765
0
    bf1[62] = bf0[62] + bf0[57];
5766
0
    bf1[63] = bf0[63] + bf0[56];
5767
5768
    // stage 6
    // Entries 1 and 3 are not produced here; no later stage reads them.
5769
0
    cospi   = cospi_arr(cos_bit);
5770
0
    bf0     = output;
5771
0
    bf1     = step;
5772
0
    bf1[0]  = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
5773
0
    bf1[2]  = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
5774
0
    bf1[4]  = bf0[4] + bf0[5];
5775
0
    bf1[5]  = -bf0[5] + bf0[4];
5776
0
    bf1[6]  = -bf0[6] + bf0[7];
5777
0
    bf1[7]  = bf0[7] + bf0[6];
5778
0
    bf1[8]  = bf0[8];
5779
0
    bf1[9]  = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
5780
0
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
5781
0
    bf1[11] = bf0[11];
5782
0
    bf1[12] = bf0[12];
5783
0
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
5784
0
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
5785
0
    bf1[15] = bf0[15];
5786
0
    bf1[16] = bf0[16] + bf0[19];
5787
0
    bf1[17] = bf0[17] + bf0[18];
5788
0
    bf1[18] = -bf0[18] + bf0[17];
5789
0
    bf1[19] = -bf0[19] + bf0[16];
5790
0
    bf1[20] = -bf0[20] + bf0[23];
5791
0
    bf1[21] = -bf0[21] + bf0[22];
5792
0
    bf1[22] = bf0[22] + bf0[21];
5793
0
    bf1[23] = bf0[23] + bf0[20];
5794
0
    bf1[24] = bf0[24] + bf0[27];
5795
0
    bf1[25] = bf0[25] + bf0[26];
5796
0
    bf1[26] = -bf0[26] + bf0[25];
5797
0
    bf1[27] = -bf0[27] + bf0[24];
5798
0
    bf1[28] = -bf0[28] + bf0[31];
5799
0
    bf1[29] = -bf0[29] + bf0[30];
5800
0
    bf1[30] = bf0[30] + bf0[29];
5801
0
    bf1[31] = bf0[31] + bf0[28];
5802
0
    bf1[32] = bf0[32];
5803
0
    bf1[33] = bf0[33];
5804
0
    bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
5805
0
    bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
5806
0
    bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
5807
0
    bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
5808
0
    bf1[38] = bf0[38];
5809
0
    bf1[39] = bf0[39];
5810
0
    bf1[40] = bf0[40];
5811
0
    bf1[41] = bf0[41];
5812
0
    bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
5813
0
    bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
5814
0
    bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
5815
0
    bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
5816
0
    bf1[46] = bf0[46];
5817
0
    bf1[47] = bf0[47];
5818
0
    bf1[48] = bf0[48];
5819
0
    bf1[49] = bf0[49];
5820
0
    bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
5821
0
    bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
5822
0
    bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
5823
0
    bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
5824
0
    bf1[54] = bf0[54];
5825
0
    bf1[55] = bf0[55];
5826
0
    bf1[56] = bf0[56];
5827
0
    bf1[57] = bf0[57];
5828
0
    bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
5829
0
    bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
5830
0
    bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
5831
0
    bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
5832
0
    bf1[62] = bf0[62];
5833
0
    bf1[63] = bf0[63];
5834
5835
    // stage 7
    // Below index 8 only the even entries (0, 2, 4, 6) are carried forward.
5836
0
    cospi   = cospi_arr(cos_bit);
5837
0
    bf0     = step;
5838
0
    bf1     = output;
5839
0
    bf1[0]  = bf0[0];
5840
0
    bf1[2]  = bf0[2];
5841
0
    bf1[4]  = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
5842
0
    bf1[6]  = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
5843
0
    bf1[8]  = bf0[8] + bf0[9];
5844
0
    bf1[9]  = -bf0[9] + bf0[8];
5845
0
    bf1[10] = -bf0[10] + bf0[11];
5846
0
    bf1[11] = bf0[11] + bf0[10];
5847
0
    bf1[12] = bf0[12] + bf0[13];
5848
0
    bf1[13] = -bf0[13] + bf0[12];
5849
0
    bf1[14] = -bf0[14] + bf0[15];
5850
0
    bf1[15] = bf0[15] + bf0[14];
5851
0
    bf1[16] = bf0[16];
5852
0
    bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
5853
0
    bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
5854
0
    bf1[19] = bf0[19];
5855
0
    bf1[20] = bf0[20];
5856
0
    bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
5857
0
    bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
5858
0
    bf1[23] = bf0[23];
5859
0
    bf1[24] = bf0[24];
5860
0
    bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
5861
0
    bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
5862
0
    bf1[27] = bf0[27];
5863
0
    bf1[28] = bf0[28];
5864
0
    bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
5865
0
    bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
5866
0
    bf1[31] = bf0[31];
5867
0
    bf1[32] = bf0[32] + bf0[35];
5868
0
    bf1[33] = bf0[33] + bf0[34];
5869
0
    bf1[34] = -bf0[34] + bf0[33];
5870
0
    bf1[35] = -bf0[35] + bf0[32];
5871
0
    bf1[36] = -bf0[36] + bf0[39];
5872
0
    bf1[37] = -bf0[37] + bf0[38];
5873
0
    bf1[38] = bf0[38] + bf0[37];
5874
0
    bf1[39] = bf0[39] + bf0[36];
5875
0
    bf1[40] = bf0[40] + bf0[43];
5876
0
    bf1[41] = bf0[41] + bf0[42];
5877
0
    bf1[42] = -bf0[42] + bf0[41];
5878
0
    bf1[43] = -bf0[43] + bf0[40];
5879
0
    bf1[44] = -bf0[44] + bf0[47];
5880
0
    bf1[45] = -bf0[45] + bf0[46];
5881
0
    bf1[46] = bf0[46] + bf0[45];
5882
0
    bf1[47] = bf0[47] + bf0[44];
5883
0
    bf1[48] = bf0[48] + bf0[51];
5884
0
    bf1[49] = bf0[49] + bf0[50];
5885
0
    bf1[50] = -bf0[50] + bf0[49];
5886
0
    bf1[51] = -bf0[51] + bf0[48];
5887
0
    bf1[52] = -bf0[52] + bf0[55];
5888
0
    bf1[53] = -bf0[53] + bf0[54];
5889
0
    bf1[54] = bf0[54] + bf0[53];
5890
0
    bf1[55] = bf0[55] + bf0[52];
5891
0
    bf1[56] = bf0[56] + bf0[59];
5892
0
    bf1[57] = bf0[57] + bf0[58];
5893
0
    bf1[58] = -bf0[58] + bf0[57];
5894
0
    bf1[59] = -bf0[59] + bf0[56];
5895
0
    bf1[60] = -bf0[60] + bf0[63];
5896
0
    bf1[61] = -bf0[61] + bf0[62];
5897
0
    bf1[62] = bf0[62] + bf0[61];
5898
0
    bf1[63] = bf0[63] + bf0[60];
5899
5900
    // stage 8
    // Below index 16 only the even entries are carried forward.
5901
0
    cospi   = cospi_arr(cos_bit);
5902
0
    bf0     = output;
5903
0
    bf1     = step;
5904
0
    bf1[0]  = bf0[0];
5905
0
    bf1[2]  = bf0[2];
5906
0
    bf1[4]  = bf0[4];
5907
0
    bf1[6]  = bf0[6];
5908
0
    bf1[8]  = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
5909
0
    bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
5910
0
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
5911
0
    bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
5912
0
    bf1[16] = bf0[16] + bf0[17];
5913
0
    bf1[17] = -bf0[17] + bf0[16];
5914
0
    bf1[18] = -bf0[18] + bf0[19];
5915
0
    bf1[19] = bf0[19] + bf0[18];
5916
0
    bf1[20] = bf0[20] + bf0[21];
5917
0
    bf1[21] = -bf0[21] + bf0[20];
5918
0
    bf1[22] = -bf0[22] + bf0[23];
5919
0
    bf1[23] = bf0[23] + bf0[22];
5920
0
    bf1[24] = bf0[24] + bf0[25];
5921
0
    bf1[25] = -bf0[25] + bf0[24];
5922
0
    bf1[26] = -bf0[26] + bf0[27];
5923
0
    bf1[27] = bf0[27] + bf0[26];
5924
0
    bf1[28] = bf0[28] + bf0[29];
5925
0
    bf1[29] = -bf0[29] + bf0[28];
5926
0
    bf1[30] = -bf0[30] + bf0[31];
5927
0
    bf1[31] = bf0[31] + bf0[30];
5928
0
    bf1[32] = bf0[32];
5929
0
    bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
5930
0
    bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
5931
0
    bf1[35] = bf0[35];
5932
0
    bf1[36] = bf0[36];
5933
0
    bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
5934
0
    bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
5935
0
    bf1[39] = bf0[39];
5936
0
    bf1[40] = bf0[40];
5937
0
    bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
5938
0
    bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
5939
0
    bf1[43] = bf0[43];
5940
0
    bf1[44] = bf0[44];
5941
0
    bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
5942
0
    bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
5943
0
    bf1[47] = bf0[47];
5944
0
    bf1[48] = bf0[48];
5945
0
    bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
5946
0
    bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
5947
0
    bf1[51] = bf0[51];
5948
0
    bf1[52] = bf0[52];
5949
0
    bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
5950
0
    bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
5951
0
    bf1[55] = bf0[55];
5952
0
    bf1[56] = bf0[56];
5953
0
    bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
5954
0
    bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
5955
0
    bf1[59] = bf0[59];
5956
0
    bf1[60] = bf0[60];
5957
0
    bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
5958
0
    bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
5959
0
    bf1[63] = bf0[63];
5960
5961
    // stage 9
    // Below index 32 only the even entries are carried forward.
5962
0
    cospi   = cospi_arr(cos_bit);
5963
0
    bf0     = step;
5964
0
    bf1     = output;
5965
0
    bf1[0]  = bf0[0];
5966
0
    bf1[2]  = bf0[2];
5967
0
    bf1[4]  = bf0[4];
5968
0
    bf1[6]  = bf0[6];
5969
0
    bf1[8]  = bf0[8];
5970
0
    bf1[10] = bf0[10];
5971
0
    bf1[12] = bf0[12];
5972
0
    bf1[14] = bf0[14];
5973
0
    bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
5974
0
    bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
5975
0
    bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
5976
0
    bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
5977
0
    bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
5978
0
    bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
5979
0
    bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
5980
0
    bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
5981
0
    bf1[32] = bf0[32] + bf0[33];
5982
0
    bf1[33] = -bf0[33] + bf0[32];
5983
0
    bf1[34] = -bf0[34] + bf0[35];
5984
0
    bf1[35] = bf0[35] + bf0[34];
5985
0
    bf1[36] = bf0[36] + bf0[37];
5986
0
    bf1[37] = -bf0[37] + bf0[36];
5987
0
    bf1[38] = -bf0[38] + bf0[39];
5988
0
    bf1[39] = bf0[39] + bf0[38];
5989
0
    bf1[40] = bf0[40] + bf0[41];
5990
0
    bf1[41] = -bf0[41] + bf0[40];
5991
0
    bf1[42] = -bf0[42] + bf0[43];
5992
0
    bf1[43] = bf0[43] + bf0[42];
5993
0
    bf1[44] = bf0[44] + bf0[45];
5994
0
    bf1[45] = -bf0[45] + bf0[44];
5995
0
    bf1[46] = -bf0[46] + bf0[47];
5996
0
    bf1[47] = bf0[47] + bf0[46];
5997
0
    bf1[48] = bf0[48] + bf0[49];
5998
0
    bf1[49] = -bf0[49] + bf0[48];
5999
0
    bf1[50] = -bf0[50] + bf0[51];
6000
0
    bf1[51] = bf0[51] + bf0[50];
6001
0
    bf1[52] = bf0[52] + bf0[53];
6002
0
    bf1[53] = -bf0[53] + bf0[52];
6003
0
    bf1[54] = -bf0[54] + bf0[55];
6004
0
    bf1[55] = bf0[55] + bf0[54];
6005
0
    bf1[56] = bf0[56] + bf0[57];
6006
0
    bf1[57] = -bf0[57] + bf0[56];
6007
0
    bf1[58] = -bf0[58] + bf0[59];
6008
0
    bf1[59] = bf0[59] + bf0[58];
6009
0
    bf1[60] = bf0[60] + bf0[61];
6010
0
    bf1[61] = -bf0[61] + bf0[60];
6011
0
    bf1[62] = -bf0[62] + bf0[63];
6012
0
    bf1[63] = bf0[63] + bf0[62];
6013
6014
    // stage 10
    // Only even-indexed entries of step[] are written from here on.
6015
0
    cospi   = cospi_arr(cos_bit);
6016
0
    bf0     = output;
6017
0
    bf1     = step;
6018
0
    bf1[0]  = bf0[0];
6019
0
    bf1[2]  = bf0[2];
6020
0
    bf1[4]  = bf0[4];
6021
0
    bf1[6]  = bf0[6];
6022
0
    bf1[8]  = bf0[8];
6023
0
    bf1[10] = bf0[10];
6024
0
    bf1[12] = bf0[12];
6025
0
    bf1[14] = bf0[14];
6026
0
    bf1[16] = bf0[16];
6027
0
    bf1[18] = bf0[18];
6028
0
    bf1[20] = bf0[20];
6029
0
    bf1[22] = bf0[22];
6030
0
    bf1[24] = bf0[24];
6031
0
    bf1[26] = bf0[26];
6032
0
    bf1[28] = bf0[28];
6033
0
    bf1[30] = bf0[30];
6034
0
    bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
6035
0
    bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
6036
0
    bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
6037
0
    bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
6038
0
    bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
6039
0
    bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
6040
0
    bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
6041
0
    bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
6042
0
    bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
6043
0
    bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
6044
0
    bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
6045
0
    bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
6046
0
    bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
6047
0
    bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
6048
0
    bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
6049
0
    bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
6050
6051
    // stage 11
    // Final reordering: output[k] takes step[] at the 6-bit bit-reversed
    // index of k, for k = 0..31 (all sources are even-indexed entries).
6052
0
    bf0     = step;
6053
0
    bf1     = output;
6054
0
    bf1[0]  = bf0[0];
6055
0
    bf1[1]  = bf0[32];
6056
0
    bf1[2]  = bf0[16];
6057
0
    bf1[3]  = bf0[48];
6058
0
    bf1[4]  = bf0[8];
6059
0
    bf1[5]  = bf0[40];
6060
0
    bf1[6]  = bf0[24];
6061
0
    bf1[7]  = bf0[56];
6062
0
    bf1[8]  = bf0[4];
6063
0
    bf1[9]  = bf0[36];
6064
0
    bf1[10] = bf0[20];
6065
0
    bf1[11] = bf0[52];
6066
0
    bf1[12] = bf0[12];
6067
0
    bf1[13] = bf0[44];
6068
0
    bf1[14] = bf0[28];
6069
0
    bf1[15] = bf0[60];
6070
0
    bf1[16] = bf0[2];
6071
0
    bf1[17] = bf0[34];
6072
0
    bf1[18] = bf0[18];
6073
0
    bf1[19] = bf0[50];
6074
0
    bf1[20] = bf0[10];
6075
0
    bf1[21] = bf0[42];
6076
0
    bf1[22] = bf0[26];
6077
0
    bf1[23] = bf0[58];
6078
0
    bf1[24] = bf0[6];
6079
0
    bf1[25] = bf0[38];
6080
0
    bf1[26] = bf0[22];
6081
0
    bf1[27] = bf0[54];
6082
0
    bf1[28] = bf0[14];
6083
0
    bf1[29] = bf0[46];
6084
0
    bf1[30] = bf0[30];
6085
0
    bf1[31] = bf0[62];
6086
0
}
6087
6088
0
static void av1_fidentity64_N2_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
6089
0
    (void)stage_range;
6090
0
    (void)cos_bit;
6091
0
    for (int32_t i = 0; i < 32; ++i) {
6092
0
        output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits);
6093
0
    }
6094
0
    assert(stage_range[0] + new_sqrt2_bits <= 32);
6095
0
}
6096
6097
0
// Maps a 1-D transform type to the forward-transform kernel used by the N2
// code path.
//
// txfmtype : transform type to look up.
// returns  : the matching kernel, or NULL for an unrecognised type (after
//            tripping assert(0) in debug builds).
static INLINE TxfmFunc fwd_txfm_type_to_func_N2(TxfmType txfmtype) {
    TxfmFunc kernel = NULL;

    switch (txfmtype) {
    // DCT family
    case TXFM_TYPE_DCT4: kernel = svt_av1_fdct4_new_N2; break;
    case TXFM_TYPE_DCT8: kernel = svt_av1_fdct8_new_N2; break;
    case TXFM_TYPE_DCT16: kernel = svt_av1_fdct16_new_N2; break;
    case TXFM_TYPE_DCT32: kernel = svt_av1_fdct32_new_N2; break;
    case TXFM_TYPE_DCT64: kernel = svt_av1_fdct64_new_N2; break;

    // ADST family
    case TXFM_TYPE_ADST4: kernel = svt_av1_fadst4_new_N2; break;
    case TXFM_TYPE_ADST8: kernel = svt_av1_fadst8_new_N2; break;
    case TXFM_TYPE_ADST16: kernel = svt_av1_fadst16_new_N2; break;
    // NOTE(review): this is the only entry that does not map to an _N2
    // kernel -- presumably no reduced ADST32 exists; confirm.
    case TXFM_TYPE_ADST32: kernel = av1_fadst32_new; break;

    // Identity family
    case TXFM_TYPE_IDENTITY4: kernel = svt_av1_fidentity4_N2_c; break;
    case TXFM_TYPE_IDENTITY8: kernel = svt_av1_fidentity8_N2_c; break;
    case TXFM_TYPE_IDENTITY16: kernel = svt_av1_fidentity16_N2_c; break;
    case TXFM_TYPE_IDENTITY32: kernel = svt_av1_fidentity32_N2_c; break;
    case TXFM_TYPE_IDENTITY64: kernel = av1_fidentity64_N2_c; break;

    default:
        // Unknown transform type: flag it in debug builds, fall through to NULL.
        assert(0);
        break;
    }

    return kernel;
}
6132
6133
static INLINE void av1_tranform_two_d_core_N2_c(int16_t* input, uint32_t input_stride, int32_t* output,
6134
0
                                                const Txfm2dFlipCfg* cfg, int32_t* buf, uint8_t bit_depth) {
6135
0
    int32_t c, r;
6136
    // Note when assigning txfm_size_col, we use the txfm_size from the
6137
    // row configuration and vice versa. This is intentionally done to
6138
    // accurately perform rectangular transforms. When the transform is
6139
    // rectangular, the number of columns will be the same as the
6140
    // txfm_size stored in the row cfg struct. It will make no difference
6141
    // for square transforms.
6142
0
    const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
6143
0
    const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
6144
    // Take the shift from the larger dimension in the rectangular case.
6145
0
    const int8_t* shift     = cfg->shift;
6146
0
    const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
6147
0
    int8_t        stage_range_col[MAX_TXFM_STAGE_NUM];
6148
0
    int8_t        stage_range_row[MAX_TXFM_STAGE_NUM];
6149
0
    assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
6150
0
    assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
6151
0
    svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth);
6152
6153
0
    const int8_t   cos_bit_col   = cfg->cos_bit_col;
6154
0
    const int8_t   cos_bit_row   = cfg->cos_bit_row;
6155
0
    const TxfmFunc txfm_func_col = fwd_txfm_type_to_func_N2(cfg->txfm_type_col);
6156
0
    const TxfmFunc txfm_func_row = fwd_txfm_type_to_func_N2(cfg->txfm_type_row);
6157
0
    ASSERT(txfm_func_col != NULL);
6158
0
    ASSERT(txfm_func_row != NULL);
6159
    // use output buffer as temp buffer
6160
0
    int32_t* temp_in  = output;
6161
0
    int32_t* temp_out = output + txfm_size_row;
6162
6163
    // Columns
6164
0
    for (c = 0; c < txfm_size_col; ++c) {
6165
0
        if (cfg->ud_flip == 0) {
6166
0
            for (r = 0; r < txfm_size_row; ++r) {
6167
0
                temp_in[r] = input[r * input_stride + c];
6168
0
            }
6169
0
        } else {
6170
0
            for (r = 0; r < txfm_size_row; ++r) {
6171
                // flip upside down
6172
0
                temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c];
6173
0
            }
6174
0
        }
6175
0
        svt_av1_round_shift_array_c(temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c
6176
0
        txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
6177
0
        svt_av1_round_shift_array_c(temp_out, txfm_size_row / 2, -shift[1]); // NM svt_av1_round_shift_array_c
6178
0
        if (cfg->lr_flip == 0) {
6179
0
            for (r = 0; r < txfm_size_row; ++r) {
6180
0
                buf[r * txfm_size_col + c] = temp_out[r];
6181
0
            }
6182
0
        } else {
6183
0
            for (r = 0; r < txfm_size_row; ++r) {
6184
                // flip from left to right
6185
0
                buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
6186
0
            }
6187
0
        }
6188
0
    }
6189
6190
    // Rows
6191
0
    for (r = 0; r < txfm_size_row / 2; ++r) {
6192
0
        txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row);
6193
0
        svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col / 2, -shift[2]);
6194
6195
0
        if (abs(rect_type) == 1) {
6196
            // Multiply everything by Sqrt2 if the transform is rectangular and the
6197
            // size difference is a factor of 2.
6198
0
            for (c = 0; c < txfm_size_col / 2; ++c) {
6199
0
                output[r * txfm_size_col + c] = round_shift((int64_t)output[r * txfm_size_col + c] * new_sqrt2,
6200
0
                                                            new_sqrt2_bits);
6201
0
            }
6202
0
        }
6203
0
    }
6204
6205
0
    for (int i = 0; i < (txfm_size_col * txfm_size_row); i++) {
6206
0
        if (i % txfm_size_col >= (txfm_size_col >> 1) || i / txfm_size_col >= (txfm_size_row >> 1)) {
6207
0
            output[i] = 0;
6208
0
        }
6209
0
    }
6210
0
}
6211
6212
/*
 * N2 forward 2-D transform for TX_64X64 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 64 * 64 entry stack
 * scratch buffer.
 */
void svt_aom_transform_two_d_64x64_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                        uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[64 * 64];

    svt_aom_transform_config(transform_type, TX_64X64, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6219
6220
/*
 * N2 forward 2-D transform for TX_32X32 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 32 * 32 entry stack
 * scratch buffer.
 */
void svt_aom_transform_two_d_32x32_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                        uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[32 * 32];

    svt_aom_transform_config(transform_type, TX_32X32, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6227
6228
/*
 * N2 forward 2-D transform for TX_16X16 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 16 * 16 entry stack
 * scratch buffer.
 */
void svt_aom_transform_two_d_16x16_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                        uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[16 * 16];

    svt_aom_transform_config(transform_type, TX_16X16, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6235
6236
/*
 * N2 forward 2-D transform for TX_8X8 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with an 8 * 8 entry stack
 * scratch buffer.
 */
void svt_aom_transform_two_d_8x8_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                      uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[8 * 8];

    svt_aom_transform_config(transform_type, TX_8X8, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6243
6244
/*
 * N2 forward 2-D transform for TX_4X4 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 4 * 4 entry stack
 * scratch buffer.
 */
void svt_aom_transform_two_d_4x4_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                      uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[4 * 4];

    svt_aom_transform_config(transform_type, TX_4X4, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6251
6252
/*
 * N2 forward 2-D transform for TX_64X32 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 64 * 32 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_64x32_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[64 * 32];

    svt_aom_transform_config(transform_type, TX_64X32, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6259
6260
/*
 * N2 forward 2-D transform for TX_32X64 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 32 * 64 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_32x64_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[32 * 64];

    svt_aom_transform_config(transform_type, TX_32X64, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6267
6268
/*
 * N2 forward 2-D transform for TX_64X16 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 64 * 16 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_64x16_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[64 * 16];

    svt_aom_transform_config(transform_type, TX_64X16, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6275
6276
/*
 * N2 forward 2-D transform for TX_16X64 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 16 * 64 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_16x64_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[16 * 64];

    svt_aom_transform_config(transform_type, TX_16X64, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6283
6284
/*
 * N2 forward 2-D transform for TX_32X16 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 32 * 16 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_32x16_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[32 * 16];

    svt_aom_transform_config(transform_type, TX_32X16, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6291
6292
/*
 * N2 forward 2-D transform for TX_16X32 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 16 * 32 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_16x32_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[16 * 32];

    svt_aom_transform_config(transform_type, TX_16X32, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6299
6300
/*
 * N2 forward 2-D transform for TX_16X8 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 16 * 8 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_16x8_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[16 * 8];

    svt_aom_transform_config(transform_type, TX_16X8, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6307
6308
/*
 * N2 forward 2-D transform for TX_8X16 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with an 8 * 16 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_8x16_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[8 * 16];

    svt_aom_transform_config(transform_type, TX_8X16, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6315
6316
/*
 * N2 forward 2-D transform for TX_32X8 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 32 * 8 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_32x8_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[32 * 8];

    svt_aom_transform_config(transform_type, TX_32X8, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6323
6324
/*
 * N2 forward 2-D transform for TX_8X32 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with an 8 * 32 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_8x32_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[8 * 32];

    svt_aom_transform_config(transform_type, TX_8X32, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6331
6332
/*
 * N2 forward 2-D transform for TX_16X4 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 16 * 4 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_16x4_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[16 * 4];

    svt_aom_transform_config(transform_type, TX_16X4, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6339
6340
/*
 * N2 forward 2-D transform for TX_4X16 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 4 * 16 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_4x16_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[4 * 16];

    svt_aom_transform_config(transform_type, TX_4X16, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6347
6348
/*
 * N2 forward 2-D transform for TX_8X4 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with an 8 * 4 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_8x4_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                 uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[8 * 4];

    svt_aom_transform_config(transform_type, TX_8X4, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6355
6356
/*
 * N2 forward 2-D transform for TX_4X8 (C path): builds the configuration
 * for transform_type and runs the shared N2 core with a 4 * 8 entry stack
 * scratch buffer.
 */
void svt_av1_fwd_txfm2d_4x8_N2_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                 uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       column_pass_buf[4 * 8];

    svt_aom_transform_config(transform_type, TX_4X8, &txfm_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &txfm_cfg, column_pass_buf, bit_depth);
}
6363
6364
0
/*
 * 4-point forward DCT kernel, N4 variant: only output[0] is written.
 * It is half_btf() applied with weight cospi[32] to the two mirrored input
 * sums (input[0] + input[3]) and (input[1] + input[2]).
 * stage_range is unused.
 */
void svt_av1_fdct4_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    const int32_t* const cospi = cospi_arr(cos_bit);

    // stage 1: fold the input around its centre
    const int32_t outer_sum = input[0] + input[3];
    const int32_t inner_sum = input[1] + input[2];

    // The single surviving coefficient.
    output[0] = half_btf(cospi[32], outer_sum, cospi[32], inner_sum, cos_bit);
}
6375
6376
0
/*
 * 4-point forward ADST kernel, N4 variant. For a non-zero input only
 * output[0] is written: the sinpi-weighted sum of the four inputs, rounded
 * down by cos_bit. An all-zero input short-circuits and clears
 * output[0..3]. stage_range is unused.
 */
void svt_av1_fadst4_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    const int32_t        bit   = cos_bit;
    const int32_t* const sinpi = sinpi_arr(bit);

    // stage 0: load the inputs
    const int32_t in0 = input[0];
    const int32_t in1 = input[1];
    const int32_t in2 = input[2];
    const int32_t in3 = input[3];

    if ((in0 | in1 | in2 | in3) == 0) {
        // Nothing to transform: clear all four outputs and leave.
        output[3] = 0;
        output[2] = 0;
        output[1] = 0;
        output[0] = 0;
        return;
    }

    // stages 1-5: accumulate the weighted terms in the same order as the
    // staged form (sinpi[1], sinpi[2], sinpi[4], then sinpi[3]).
    int32_t acc = sinpi[1] * in0;
    acc         = acc + sinpi[2] * in1;
    acc         = acc + sinpi[4] * in3;
    acc         = acc + sinpi[3] * in2;

    // 1-D transform scaling factor is sqrt(2).
    output[0] = round_shift(acc, bit);
}
6412
6413
0
void svt_av1_fidentity4_N4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
6414
0
    (void)stage_range;
6415
0
    (void)cos_bit;
6416
0
    output[0] = round_shift((int64_t)input[0] * new_sqrt2, new_sqrt2_bits);
6417
0
    assert(stage_range[0] + new_sqrt2_bits <= 32);
6418
0
}
6419
6420
0
/*
 * 8-point forward DCT kernel, N4 variant: on return only output[0] and
 * output[1] hold final coefficients. output[2..7] are used as working
 * storage and are left holding intermediate values, so `output` must have
 * room for 8 entries. stage_range is unused.
 */
void svt_av1_fdct8_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t step[8];
    int32_t i;

    // stage 1: mirrored sums in output[0..3], mirrored differences in output[4..7]
    for (i = 0; i < 4; ++i) {
        output[i] = input[i] + input[7 - i];
    }
    for (i = 4; i < 8; ++i) {
        output[i] = -input[i] + input[7 - i];
    }

    // stage 2
    const int32_t* const cospi = cospi_arr(cos_bit);
    step[0]                    = output[0] + output[3];
    step[1]                    = output[1] + output[2];
    step[4]                    = output[4];
    step[5]                    = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    step[6]                    = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    step[7]                    = output[7];

    // stage 3: output[0] is already final here
    output[0] = half_btf(cospi[32], step[0], cospi[32], step[1], cos_bit);
    output[4] = step[4] + step[5];
    output[7] = step[7] + step[6];

    // stages 4-5: the second kept coefficient lands in output[1]
    step[4]   = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);
    output[1] = step[4];
}
6470
6471
0
/*
 * 8-point forward ADST kernel, N4 variant: on return only output[0] and
 * output[1] hold final coefficients. The remaining entries of `output` are
 * used as working storage (8 entries are written), and `output` must not be
 * the same buffer as `input`. stage_range is unused.
 */
void svt_av1_fadst8_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t step[8];

    // stage 1: signed permutation of the input into the working buffer
    assert(output != input);
    output[0] = input[0];
    output[1] = -input[7];
    output[2] = -input[3];
    output[3] = input[4];
    output[4] = -input[1];
    output[5] = input[6];
    output[6] = input[2];
    output[7] = -input[5];

    // stage 2
    const int32_t* const cospi = cospi_arr(cos_bit);
    step[0]                    = output[0];
    step[1]                    = output[1];
    step[2]                    = half_btf(cospi[32], output[2], cospi[32], output[3], cos_bit);
    step[3]                    = half_btf(cospi[32], output[2], -cospi[32], output[3], cos_bit);
    step[4]                    = output[4];
    step[5]                    = output[5];
    step[6]                    = half_btf(cospi[32], output[6], cospi[32], output[7], cos_bit);
    step[7]                    = half_btf(cospi[32], output[6], -cospi[32], output[7], cos_bit);

    // stage 3
    output[0] = step[0] + step[2];
    output[1] = step[1] + step[3];
    output[2] = step[0] - step[2];
    output[3] = step[1] - step[3];
    output[4] = step[4] + step[6];
    output[5] = step[5] + step[7];
    output[6] = step[4] - step[6];
    output[7] = step[5] - step[7];

    // stage 4
    step[0] = output[0];
    step[1] = output[1];
    step[2] = output[2];
    step[3] = output[3];
    step[4] = half_btf(cospi[16], output[4], cospi[48], output[5], cos_bit);
    step[5] = half_btf(cospi[48], output[4], -cospi[16], output[5], cos_bit);
    step[6] = half_btf(-cospi[48], output[6], cospi[16], output[7], cos_bit);
    step[7] = half_btf(cospi[16], output[6], cospi[48], output[7], cos_bit);

    // stage 5: only the terms feeding the two kept coefficients
    output[0] = step[0] + step[4];
    output[1] = step[1] + step[5];
    output[6] = step[2] - step[6];
    output[7] = step[3] - step[7];

    // stage 6
    step[1] = half_btf(cospi[60], output[0], -cospi[4], output[1], cos_bit);
    step[6] = half_btf(cospi[52], output[6], cospi[12], output[7], cos_bit);

    // stage 7: place the two kept coefficients
    output[0] = step[1];
    output[1] = step[6];
}
6549
6550
0
/*
 * 8-point identity kernel, N4 variant: writes only output[0] and output[1],
 * each being the matching input doubled. cos_bit and stage_range are unused.
 */
void svt_av1_fidentity8_N4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)cos_bit;
    (void)stage_range;

    // The two kept outputs, written in ascending order.
    output[0] = input[0] * 2;
    output[1] = input[1] * 2;
}
6557
6558
0
/*
 * 16-point forward DCT kernel, N4 variant: on return only output[0..3] hold
 * final coefficients. output[4..15] are used as working storage and are left
 * holding intermediate values, so `output` must have room for 16 entries.
 * stage_range is unused.
 */
void svt_av1_fdct16_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t step[16];
    int32_t i;

    // stage 1: mirrored sums in output[0..7], mirrored differences in output[8..15]
    for (i = 0; i < 8; ++i) {
        output[i] = input[i] + input[15 - i];
    }
    for (i = 8; i < 16; ++i) {
        output[i] = -input[i] + input[15 - i];
    }

    // stage 2
    const int32_t* const cospi = cospi_arr(cos_bit);
    for (i = 0; i < 4; ++i) {
        step[i] = output[i] + output[7 - i];
    }
    for (i = 4; i < 8; ++i) {
        step[i] = -output[i] + output[7 - i];
    }
    step[8]  = output[8];
    step[9]  = output[9];
    step[10] = half_btf(-cospi[32], output[10], cospi[32], output[13], cos_bit);
    step[11] = half_btf(-cospi[32], output[11], cospi[32], output[12], cos_bit);
    step[12] = half_btf(cospi[32], output[12], cospi[32], output[11], cos_bit);
    step[13] = half_btf(cospi[32], output[13], cospi[32], output[10], cos_bit);
    step[14] = output[14];
    step[15] = output[15];

    // stage 3
    output[0]  = step[0] + step[3];
    output[1]  = step[1] + step[2];
    output[4]  = step[4];
    output[5]  = half_btf(-cospi[32], step[5], cospi[32], step[6], cos_bit);
    output[6]  = half_btf(cospi[32], step[6], cospi[32], step[5], cos_bit);
    output[7]  = step[7];
    output[8]  = step[8] + step[11];
    output[9]  = step[9] + step[10];
    output[10] = -step[10] + step[9];
    output[11] = -step[11] + step[8];
    output[12] = -step[12] + step[15];
    output[13] = -step[13] + step[14];
    output[14] = step[14] + step[13];
    output[15] = step[15] + step[12];

    // stage 4
    step[0]  = half_btf(cospi[32], output[0], cospi[32], output[1], cos_bit);
    step[4]  = output[4] + output[5];
    step[7]  = output[7] + output[6];
    step[8]  = output[8];
    step[9]  = half_btf(-cospi[16], output[9], cospi[48], output[14], cos_bit);
    step[10] = half_btf(-cospi[48], output[10], -cospi[16], output[13], cos_bit);
    step[11] = output[11];
    step[12] = output[12];
    step[13] = half_btf(cospi[48], output[13], -cospi[16], output[10], cos_bit);
    step[14] = half_btf(cospi[16], output[14], cospi[48], output[9], cos_bit);
    step[15] = output[15];

    // stage 5: only the terms feeding the four kept coefficients
    output[0]  = step[0];
    output[4]  = half_btf(cospi[56], step[4], cospi[8], step[7], cos_bit);
    output[8]  = step[8] + step[9];
    output[11] = step[11] + step[10];
    output[12] = step[12] + step[13];
    output[15] = step[15] + step[14];

    // stage 6
    step[0]  = output[0];
    step[4]  = output[4];
    step[8]  = half_btf(cospi[60], output[8], cospi[4], output[15], cos_bit);
    step[12] = half_btf(cospi[12], output[12], -cospi[52], output[11], cos_bit);

    // stage 7: place the four kept coefficients
    output[0] = step[0];
    output[1] = step[8];
    output[2] = step[4];
    output[3] = step[12];
}
6666
6667
0
/*
 * 16-point forward ADST kernel, N4 variant: on return only output[0..3] hold
 * final coefficients. The remaining entries of `output` are used as working
 * storage (16 entries are written), and `output` must not be the same buffer
 * as `input`. stage_range is unused.
 */
void svt_av1_fadst16_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)stage_range;
    int32_t step[16];
    int32_t k, j;

    // stage 1: signed permutation of the input into the working buffer
    assert(output != input);
    output[0]  = input[0];
    output[1]  = -input[15];
    output[2]  = -input[7];
    output[3]  = input[8];
    output[4]  = -input[3];
    output[5]  = input[12];
    output[6]  = input[4];
    output[7]  = -input[11];
    output[8]  = -input[1];
    output[9]  = input[14];
    output[10] = input[6];
    output[11] = -input[9];
    output[12] = input[2];
    output[13] = -input[13];
    output[14] = -input[5];
    output[15] = input[10];

    // stage 2: in each group of four, pass two through and rotate the other two
    const int32_t* const cospi = cospi_arr(cos_bit);
    for (k = 0; k < 16; k += 4) {
        step[k]     = output[k];
        step[k + 1] = output[k + 1];
        step[k + 2] = half_btf(cospi[32], output[k + 2], cospi[32], output[k + 3], cos_bit);
        step[k + 3] = half_btf(cospi[32], output[k + 2], -cospi[32], output[k + 3], cos_bit);
    }

    // stage 3: add/subtract at distance 2 inside each group of four
    for (k = 0; k < 16; k += 4) {
        output[k]     = step[k] + step[k + 2];
        output[k + 1] = step[k + 1] + step[k + 3];
        output[k + 2] = step[k] - step[k + 2];
        output[k + 3] = step[k + 1] - step[k + 3];
    }

    // stage 4: in each group of eight, pass four through and rotate the other four
    for (k = 0; k < 16; k += 8) {
        step[k]     = output[k];
        step[k + 1] = output[k + 1];
        step[k + 2] = output[k + 2];
        step[k + 3] = output[k + 3];
        step[k + 4] = half_btf(cospi[16], output[k + 4], cospi[48], output[k + 5], cos_bit);
        step[k + 5] = half_btf(cospi[48], output[k + 4], -cospi[16], output[k + 5], cos_bit);
        step[k + 6] = half_btf(-cospi[48], output[k + 6], cospi[16], output[k + 7], cos_bit);
        step[k + 7] = half_btf(cospi[16], output[k + 6], cospi[48], output[k + 7], cos_bit);
    }

    // stage 5: add/subtract at distance 4 inside each group of eight
    for (k = 0; k < 16; k += 8) {
        for (j = 0; j < 4; ++j) {
            output[k + j] = step[k + j] + step[k + j + 4];
        }
        for (j = 0; j < 4; ++j) {
            output[k + 4 + j] = step[k + j] - step[k + j + 4];
        }
    }

    // stage 6: lower half passes through, upper half is rotated pairwise
    for (j = 0; j < 8; ++j) {
        step[j] = output[j];
    }
    step[8]  = half_btf(cospi[8], output[8], cospi[56], output[9], cos_bit);
    step[9]  = half_btf(cospi[56], output[8], -cospi[8], output[9], cos_bit);
    step[10] = half_btf(cospi[40], output[10], cospi[24], output[11], cos_bit);
    step[11] = half_btf(cospi[24], output[10], -cospi[40], output[11], cos_bit);
    step[12] = half_btf(-cospi[56], output[12], cospi[8], output[13], cos_bit);
    step[13] = half_btf(cospi[8], output[12], cospi[56], output[13], cos_bit);
    step[14] = half_btf(-cospi[24], output[14], cospi[40], output[15], cos_bit);
    step[15] = half_btf(cospi[40], output[14], cospi[24], output[15], cos_bit);

    // stage 7: only the terms feeding the four kept coefficients
    for (j = 0; j < 4; ++j) {
        output[j] = step[j] + step[j + 8];
    }
    for (j = 0; j < 4; ++j) {
        output[12 + j] = step[4 + j] - step[12 + j];
    }

    // stage 8
    step[1]  = half_btf(cospi[62], output[0], -cospi[2], output[1], cos_bit);
    step[3]  = half_btf(cospi[54], output[2], -cospi[10], output[3], cos_bit);
    step[12] = half_btf(cospi[50], output[12], cospi[14], output[13], cos_bit);
    step[14] = half_btf(cospi[58], output[14], cospi[6], output[15], cos_bit);

    // stage 9: place the four kept coefficients
    output[0] = step[1];
    output[1] = step[14];
    output[2] = step[3];
    output[3] = step[12];
}
6825
6826
0
void svt_av1_fidentity16_N4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
6827
0
    (void)stage_range;
6828
0
    (void)cos_bit;
6829
0
    for (int32_t i = 0; i < 4; ++i) {
6830
0
        output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits);
6831
0
    }
6832
0
    assert(stage_range[0] + new_sqrt2_bits <= 32);
6833
0
}
6834
6835
0
/*
 * Nine-stage butterfly network over 32 input samples (by its name, the reduced
 * "N4" form of a forward 32-point DCT -- TODO confirm against the full-size
 * variant).
 *
 *   input       - 32 source samples; input[0..31] are all read in stage 1.
 *   output      - destination buffer, also used as scratch: odd-numbered stages
 *                 write into it, even-numbered stages write into the local
 *                 step[] array. Only output[0..7] are assigned by the final
 *                 stage; output[8..31] are left holding intermediate values
 *                 from earlier stages, so the buffer must have room for 32
 *                 entries.
 *   cos_bit     - forwarded to cospi_arr() and to every half_btf() call.
 *   stage_range - unused.
 *
 * NOTE(review): stage 1 writes output[k] while input[k] is still needed for a
 * later element, so input and output presumably must not alias -- confirm
 * against callers.
 */
void svt_av1_fdct32_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
6836
0
    (void)stage_range;
6837
0
    const int32_t* cospi;
6838
6839
0
    int32_t *bf0, *bf1;
6840
0
    int32_t  step[32];
6841
6842
    // stage 0;
6843
6844
    // stage 1;
    // Fold the input: output[k] = input[k] + input[31 - k] for k < 16, and the
    // mirrored differences (input[31 - k] - input[k]) for k = 16..31.
6845
0
    bf1     = output;
6846
0
    bf1[0]  = input[0] + input[31];
6847
0
    bf1[1]  = input[1] + input[30];
6848
0
    bf1[2]  = input[2] + input[29];
6849
0
    bf1[3]  = input[3] + input[28];
6850
0
    bf1[4]  = input[4] + input[27];
6851
0
    bf1[5]  = input[5] + input[26];
6852
0
    bf1[6]  = input[6] + input[25];
6853
0
    bf1[7]  = input[7] + input[24];
6854
0
    bf1[8]  = input[8] + input[23];
6855
0
    bf1[9]  = input[9] + input[22];
6856
0
    bf1[10] = input[10] + input[21];
6857
0
    bf1[11] = input[11] + input[20];
6858
0
    bf1[12] = input[12] + input[19];
6859
0
    bf1[13] = input[13] + input[18];
6860
0
    bf1[14] = input[14] + input[17];
6861
0
    bf1[15] = input[15] + input[16];
6862
0
    bf1[16] = -input[16] + input[15];
6863
0
    bf1[17] = -input[17] + input[14];
6864
0
    bf1[18] = -input[18] + input[13];
6865
0
    bf1[19] = -input[19] + input[12];
6866
0
    bf1[20] = -input[20] + input[11];
6867
0
    bf1[21] = -input[21] + input[10];
6868
0
    bf1[22] = -input[22] + input[9];
6869
0
    bf1[23] = -input[23] + input[8];
6870
0
    bf1[24] = -input[24] + input[7];
6871
0
    bf1[25] = -input[25] + input[6];
6872
0
    bf1[26] = -input[26] + input[5];
6873
0
    bf1[27] = -input[27] + input[4];
6874
0
    bf1[28] = -input[28] + input[3];
6875
0
    bf1[29] = -input[29] + input[2];
6876
0
    bf1[30] = -input[30] + input[1];
6877
0
    bf1[31] = -input[31] + input[0];
6878
6879
    // stage 2
    // cospi_arr(cos_bit) supplies the cospi[] constants used by every half_btf
    // call below; this stage reads output[] and fills all 32 entries of step[].
6880
0
    cospi   = cospi_arr(cos_bit);
6881
0
    bf0     = output;
6882
0
    bf1     = step;
6883
0
    bf1[0]  = bf0[0] + bf0[15];
6884
0
    bf1[1]  = bf0[1] + bf0[14];
6885
0
    bf1[2]  = bf0[2] + bf0[13];
6886
0
    bf1[3]  = bf0[3] + bf0[12];
6887
0
    bf1[4]  = bf0[4] + bf0[11];
6888
0
    bf1[5]  = bf0[5] + bf0[10];
6889
0
    bf1[6]  = bf0[6] + bf0[9];
6890
0
    bf1[7]  = bf0[7] + bf0[8];
6891
0
    bf1[8]  = -bf0[8] + bf0[7];
6892
0
    bf1[9]  = -bf0[9] + bf0[6];
6893
0
    bf1[10] = -bf0[10] + bf0[5];
6894
0
    bf1[11] = -bf0[11] + bf0[4];
6895
0
    bf1[12] = -bf0[12] + bf0[3];
6896
0
    bf1[13] = -bf0[13] + bf0[2];
6897
0
    bf1[14] = -bf0[14] + bf0[1];
6898
0
    bf1[15] = -bf0[15] + bf0[0];
6899
0
    bf1[16] = bf0[16];
6900
0
    bf1[17] = bf0[17];
6901
0
    bf1[18] = bf0[18];
6902
0
    bf1[19] = bf0[19];
6903
0
    bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
6904
0
    bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
6905
0
    bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
6906
0
    bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
6907
0
    bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
6908
0
    bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
6909
0
    bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
6910
0
    bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
6911
0
    bf1[28] = bf0[28];
6912
0
    bf1[29] = bf0[29];
6913
0
    bf1[30] = bf0[30];
6914
0
    bf1[31] = bf0[31];
6915
6916
    // stage 3
    // Reads step[], rewrites all 32 entries of output[].
6917
0
    bf0     = step;
6918
0
    bf1     = output;
6919
0
    bf1[0]  = bf0[0] + bf0[7];
6920
0
    bf1[1]  = bf0[1] + bf0[6];
6921
0
    bf1[2]  = bf0[2] + bf0[5];
6922
0
    bf1[3]  = bf0[3] + bf0[4];
6923
0
    bf1[4]  = -bf0[4] + bf0[3];
6924
0
    bf1[5]  = -bf0[5] + bf0[2];
6925
0
    bf1[6]  = -bf0[6] + bf0[1];
6926
0
    bf1[7]  = -bf0[7] + bf0[0];
6927
0
    bf1[8]  = bf0[8];
6928
0
    bf1[9]  = bf0[9];
6929
0
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
6930
0
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
6931
0
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
6932
0
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
6933
0
    bf1[14] = bf0[14];
6934
0
    bf1[15] = bf0[15];
6935
0
    bf1[16] = bf0[16] + bf0[23];
6936
0
    bf1[17] = bf0[17] + bf0[22];
6937
0
    bf1[18] = bf0[18] + bf0[21];
6938
0
    bf1[19] = bf0[19] + bf0[20];
6939
0
    bf1[20] = -bf0[20] + bf0[19];
6940
0
    bf1[21] = -bf0[21] + bf0[18];
6941
0
    bf1[22] = -bf0[22] + bf0[17];
6942
0
    bf1[23] = -bf0[23] + bf0[16];
6943
0
    bf1[24] = -bf0[24] + bf0[31];
6944
0
    bf1[25] = -bf0[25] + bf0[30];
6945
0
    bf1[26] = -bf0[26] + bf0[29];
6946
0
    bf1[27] = -bf0[27] + bf0[28];
6947
0
    bf1[28] = bf0[28] + bf0[27];
6948
0
    bf1[29] = bf0[29] + bf0[26];
6949
0
    bf1[30] = bf0[30] + bf0[25];
6950
0
    bf1[31] = bf0[31] + bf0[24];
6951
6952
    // stage 4
    // step[2] and step[3] are not written here and are not read by any later
    // stage; from this point on only a subset of the low-index lanes is kept.
6953
0
    bf0     = output;
6954
0
    bf1     = step;
6955
0
    bf1[0]  = bf0[0] + bf0[3];
6956
0
    bf1[1]  = bf0[1] + bf0[2];
6957
0
    bf1[4]  = bf0[4];
6958
0
    bf1[5]  = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
6959
0
    bf1[6]  = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
6960
0
    bf1[7]  = bf0[7];
6961
0
    bf1[8]  = bf0[8] + bf0[11];
6962
0
    bf1[9]  = bf0[9] + bf0[10];
6963
0
    bf1[10] = -bf0[10] + bf0[9];
6964
0
    bf1[11] = -bf0[11] + bf0[8];
6965
0
    bf1[12] = -bf0[12] + bf0[15];
6966
0
    bf1[13] = -bf0[13] + bf0[14];
6967
0
    bf1[14] = bf0[14] + bf0[13];
6968
0
    bf1[15] = bf0[15] + bf0[12];
6969
0
    bf1[16] = bf0[16];
6970
0
    bf1[17] = bf0[17];
6971
0
    bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
6972
0
    bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
6973
0
    bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
6974
0
    bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
6975
0
    bf1[22] = bf0[22];
6976
0
    bf1[23] = bf0[23];
6977
0
    bf1[24] = bf0[24];
6978
0
    bf1[25] = bf0[25];
6979
0
    bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
6980
0
    bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
6981
0
    bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
6982
0
    bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
6983
0
    bf1[30] = bf0[30];
6984
0
    bf1[31] = bf0[31];
6985
6986
    // stage 5
    // Lanes 0..7 shrink to entries 0, 4 and 7; lanes 8..31 are all still updated.
6987
0
    bf0     = step;
6988
0
    bf1     = output;
6989
0
    bf1[0]  = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
6990
0
    bf1[4]  = bf0[4] + bf0[5];
6991
0
    bf1[7]  = bf0[7] + bf0[6];
6992
0
    bf1[8]  = bf0[8];
6993
0
    bf1[9]  = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
6994
0
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
6995
0
    bf1[11] = bf0[11];
6996
0
    bf1[12] = bf0[12];
6997
0
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
6998
0
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
6999
0
    bf1[15] = bf0[15];
7000
0
    bf1[16] = bf0[16] + bf0[19];
7001
0
    bf1[17] = bf0[17] + bf0[18];
7002
0
    bf1[18] = -bf0[18] + bf0[17];
7003
0
    bf1[19] = -bf0[19] + bf0[16];
7004
0
    bf1[20] = -bf0[20] + bf0[23];
7005
0
    bf1[21] = -bf0[21] + bf0[22];
7006
0
    bf1[22] = bf0[22] + bf0[21];
7007
0
    bf1[23] = bf0[23] + bf0[20];
7008
0
    bf1[24] = bf0[24] + bf0[27];
7009
0
    bf1[25] = bf0[25] + bf0[26];
7010
0
    bf1[26] = -bf0[26] + bf0[25];
7011
0
    bf1[27] = -bf0[27] + bf0[24];
7012
0
    bf1[28] = -bf0[28] + bf0[31];
7013
0
    bf1[29] = -bf0[29] + bf0[30];
7014
0
    bf1[30] = bf0[30] + bf0[29];
7015
0
    bf1[31] = bf0[31] + bf0[28];
7016
7017
    // stage 6
    // Lanes 8..15 shrink to entries 8, 11, 12 and 15; lanes 16..31 are all kept.
7018
0
    bf0     = output;
7019
0
    bf1     = step;
7020
0
    bf1[0]  = bf0[0];
7021
0
    bf1[4]  = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
7022
0
    bf1[8]  = bf0[8] + bf0[9];
7023
0
    bf1[11] = bf0[11] + bf0[10];
7024
0
    bf1[12] = bf0[12] + bf0[13];
7025
0
    bf1[15] = bf0[15] + bf0[14];
7026
0
    bf1[16] = bf0[16];
7027
0
    bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
7028
0
    bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
7029
0
    bf1[19] = bf0[19];
7030
0
    bf1[20] = bf0[20];
7031
0
    bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
7032
0
    bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
7033
0
    bf1[23] = bf0[23];
7034
0
    bf1[24] = bf0[24];
7035
0
    bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
7036
0
    bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
7037
0
    bf1[27] = bf0[27];
7038
0
    bf1[28] = bf0[28];
7039
0
    bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
7040
0
    bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
7041
0
    bf1[31] = bf0[31];
7042
7043
    // stage 7
    // Lanes 16..31 shrink to the eight entries that stage 8 reads.
7044
0
    bf0     = step;
7045
0
    bf1     = output;
7046
0
    bf1[0]  = bf0[0];
7047
0
    bf1[4]  = bf0[4];
7048
0
    bf1[8]  = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
7049
0
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
7050
0
    bf1[16] = bf0[16] + bf0[17];
7051
0
    bf1[19] = bf0[19] + bf0[18];
7052
0
    bf1[20] = bf0[20] + bf0[21];
7053
0
    bf1[23] = bf0[23] + bf0[22];
7054
0
    bf1[24] = bf0[24] + bf0[25];
7055
0
    bf1[27] = bf0[27] + bf0[26];
7056
0
    bf1[28] = bf0[28] + bf0[29];
7057
0
    bf1[31] = bf0[31] + bf0[30];
7058
7059
    // stage 8
    // Produces the eight values (step[0, 4, 8, 12, 16, 20, 24, 28]) gathered in stage 9.
7060
0
    bf0     = output;
7061
0
    bf1     = step;
7062
0
    bf1[0]  = bf0[0];
7063
0
    bf1[4]  = bf0[4];
7064
0
    bf1[8]  = bf0[8];
7065
0
    bf1[12] = bf0[12];
7066
0
    bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
7067
0
    bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
7068
0
    bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
7069
0
    bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
7070
7071
    // stage 9
    // Reorder the eight retained values into output[0..7]. output[8..31] are
    // not touched here and still hold intermediate values from earlier stages.
7072
0
    bf0    = step;
7073
0
    bf1    = output;
7074
0
    bf1[0] = bf0[0];
7075
0
    bf1[1] = bf0[16];
7076
0
    bf1[2] = bf0[8];
7077
0
    bf1[3] = bf0[24];
7078
0
    bf1[4] = bf0[4];
7079
0
    bf1[5] = bf0[20];
7080
0
    bf1[6] = bf0[12];
7081
0
    bf1[7] = bf0[28];
7082
0
}
7083
7084
0
/*
 * Reduced ("N4") 32-point identity kernel: copies the first eight input
 * samples to the output, each multiplied by 4.
 *
 *   input       - source samples; only input[0..7] are read.
 *   output      - destination; only output[0..7] are written.
 *   cos_bit     - unused, present to match the common 1-D transform signature.
 *   stage_range - unused.
 */
void svt_av1_fidentity32_N4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
    (void)cos_bit;
    (void)stage_range;

    int32_t idx = 0;
    while (idx < 8) {
        output[idx] = 4 * input[idx];
        ++idx;
    }
}
7091
7092
0
void svt_av1_fdct64_new_N4(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
7093
0
    (void)stage_range;
7094
0
    const int32_t* cospi;
7095
7096
0
    int32_t *bf0, *bf1;
7097
0
    int32_t  step[64];
7098
7099
    // stage 0;
7100
7101
    // stage 1;
7102
0
    bf1     = output;
7103
0
    bf1[0]  = input[0] + input[63];
7104
0
    bf1[1]  = input[1] + input[62];
7105
0
    bf1[2]  = input[2] + input[61];
7106
0
    bf1[3]  = input[3] + input[60];
7107
0
    bf1[4]  = input[4] + input[59];
7108
0
    bf1[5]  = input[5] + input[58];
7109
0
    bf1[6]  = input[6] + input[57];
7110
0
    bf1[7]  = input[7] + input[56];
7111
0
    bf1[8]  = input[8] + input[55];
7112
0
    bf1[9]  = input[9] + input[54];
7113
0
    bf1[10] = input[10] + input[53];
7114
0
    bf1[11] = input[11] + input[52];
7115
0
    bf1[12] = input[12] + input[51];
7116
0
    bf1[13] = input[13] + input[50];
7117
0
    bf1[14] = input[14] + input[49];
7118
0
    bf1[15] = input[15] + input[48];
7119
0
    bf1[16] = input[16] + input[47];
7120
0
    bf1[17] = input[17] + input[46];
7121
0
    bf1[18] = input[18] + input[45];
7122
0
    bf1[19] = input[19] + input[44];
7123
0
    bf1[20] = input[20] + input[43];
7124
0
    bf1[21] = input[21] + input[42];
7125
0
    bf1[22] = input[22] + input[41];
7126
0
    bf1[23] = input[23] + input[40];
7127
0
    bf1[24] = input[24] + input[39];
7128
0
    bf1[25] = input[25] + input[38];
7129
0
    bf1[26] = input[26] + input[37];
7130
0
    bf1[27] = input[27] + input[36];
7131
0
    bf1[28] = input[28] + input[35];
7132
0
    bf1[29] = input[29] + input[34];
7133
0
    bf1[30] = input[30] + input[33];
7134
0
    bf1[31] = input[31] + input[32];
7135
0
    bf1[32] = -input[32] + input[31];
7136
0
    bf1[33] = -input[33] + input[30];
7137
0
    bf1[34] = -input[34] + input[29];
7138
0
    bf1[35] = -input[35] + input[28];
7139
0
    bf1[36] = -input[36] + input[27];
7140
0
    bf1[37] = -input[37] + input[26];
7141
0
    bf1[38] = -input[38] + input[25];
7142
0
    bf1[39] = -input[39] + input[24];
7143
0
    bf1[40] = -input[40] + input[23];
7144
0
    bf1[41] = -input[41] + input[22];
7145
0
    bf1[42] = -input[42] + input[21];
7146
0
    bf1[43] = -input[43] + input[20];
7147
0
    bf1[44] = -input[44] + input[19];
7148
0
    bf1[45] = -input[45] + input[18];
7149
0
    bf1[46] = -input[46] + input[17];
7150
0
    bf1[47] = -input[47] + input[16];
7151
0
    bf1[48] = -input[48] + input[15];
7152
0
    bf1[49] = -input[49] + input[14];
7153
0
    bf1[50] = -input[50] + input[13];
7154
0
    bf1[51] = -input[51] + input[12];
7155
0
    bf1[52] = -input[52] + input[11];
7156
0
    bf1[53] = -input[53] + input[10];
7157
0
    bf1[54] = -input[54] + input[9];
7158
0
    bf1[55] = -input[55] + input[8];
7159
0
    bf1[56] = -input[56] + input[7];
7160
0
    bf1[57] = -input[57] + input[6];
7161
0
    bf1[58] = -input[58] + input[5];
7162
0
    bf1[59] = -input[59] + input[4];
7163
0
    bf1[60] = -input[60] + input[3];
7164
0
    bf1[61] = -input[61] + input[2];
7165
0
    bf1[62] = -input[62] + input[1];
7166
0
    bf1[63] = -input[63] + input[0];
7167
7168
    // stage 2
7169
0
    cospi   = cospi_arr(cos_bit);
7170
0
    bf0     = output;
7171
0
    bf1     = step;
7172
0
    bf1[0]  = bf0[0] + bf0[31];
7173
0
    bf1[1]  = bf0[1] + bf0[30];
7174
0
    bf1[2]  = bf0[2] + bf0[29];
7175
0
    bf1[3]  = bf0[3] + bf0[28];
7176
0
    bf1[4]  = bf0[4] + bf0[27];
7177
0
    bf1[5]  = bf0[5] + bf0[26];
7178
0
    bf1[6]  = bf0[6] + bf0[25];
7179
0
    bf1[7]  = bf0[7] + bf0[24];
7180
0
    bf1[8]  = bf0[8] + bf0[23];
7181
0
    bf1[9]  = bf0[9] + bf0[22];
7182
0
    bf1[10] = bf0[10] + bf0[21];
7183
0
    bf1[11] = bf0[11] + bf0[20];
7184
0
    bf1[12] = bf0[12] + bf0[19];
7185
0
    bf1[13] = bf0[13] + bf0[18];
7186
0
    bf1[14] = bf0[14] + bf0[17];
7187
0
    bf1[15] = bf0[15] + bf0[16];
7188
0
    bf1[16] = -bf0[16] + bf0[15];
7189
0
    bf1[17] = -bf0[17] + bf0[14];
7190
0
    bf1[18] = -bf0[18] + bf0[13];
7191
0
    bf1[19] = -bf0[19] + bf0[12];
7192
0
    bf1[20] = -bf0[20] + bf0[11];
7193
0
    bf1[21] = -bf0[21] + bf0[10];
7194
0
    bf1[22] = -bf0[22] + bf0[9];
7195
0
    bf1[23] = -bf0[23] + bf0[8];
7196
0
    bf1[24] = -bf0[24] + bf0[7];
7197
0
    bf1[25] = -bf0[25] + bf0[6];
7198
0
    bf1[26] = -bf0[26] + bf0[5];
7199
0
    bf1[27] = -bf0[27] + bf0[4];
7200
0
    bf1[28] = -bf0[28] + bf0[3];
7201
0
    bf1[29] = -bf0[29] + bf0[2];
7202
0
    bf1[30] = -bf0[30] + bf0[1];
7203
0
    bf1[31] = -bf0[31] + bf0[0];
7204
0
    bf1[32] = bf0[32];
7205
0
    bf1[33] = bf0[33];
7206
0
    bf1[34] = bf0[34];
7207
0
    bf1[35] = bf0[35];
7208
0
    bf1[36] = bf0[36];
7209
0
    bf1[37] = bf0[37];
7210
0
    bf1[38] = bf0[38];
7211
0
    bf1[39] = bf0[39];
7212
0
    bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
7213
0
    bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
7214
0
    bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
7215
0
    bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
7216
0
    bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
7217
0
    bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
7218
0
    bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
7219
0
    bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
7220
0
    bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
7221
0
    bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
7222
0
    bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
7223
0
    bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
7224
0
    bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
7225
0
    bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
7226
0
    bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
7227
0
    bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
7228
0
    bf1[56] = bf0[56];
7229
0
    bf1[57] = bf0[57];
7230
0
    bf1[58] = bf0[58];
7231
0
    bf1[59] = bf0[59];
7232
0
    bf1[60] = bf0[60];
7233
0
    bf1[61] = bf0[61];
7234
0
    bf1[62] = bf0[62];
7235
0
    bf1[63] = bf0[63];
7236
7237
    // stage 3
7238
0
    cospi   = cospi_arr(cos_bit);
7239
0
    bf0     = step;
7240
0
    bf1     = output;
7241
0
    bf1[0]  = bf0[0] + bf0[15];
7242
0
    bf1[1]  = bf0[1] + bf0[14];
7243
0
    bf1[2]  = bf0[2] + bf0[13];
7244
0
    bf1[3]  = bf0[3] + bf0[12];
7245
0
    bf1[4]  = bf0[4] + bf0[11];
7246
0
    bf1[5]  = bf0[5] + bf0[10];
7247
0
    bf1[6]  = bf0[6] + bf0[9];
7248
0
    bf1[7]  = bf0[7] + bf0[8];
7249
0
    bf1[8]  = -bf0[8] + bf0[7];
7250
0
    bf1[9]  = -bf0[9] + bf0[6];
7251
0
    bf1[10] = -bf0[10] + bf0[5];
7252
0
    bf1[11] = -bf0[11] + bf0[4];
7253
0
    bf1[12] = -bf0[12] + bf0[3];
7254
0
    bf1[13] = -bf0[13] + bf0[2];
7255
0
    bf1[14] = -bf0[14] + bf0[1];
7256
0
    bf1[15] = -bf0[15] + bf0[0];
7257
0
    bf1[16] = bf0[16];
7258
0
    bf1[17] = bf0[17];
7259
0
    bf1[18] = bf0[18];
7260
0
    bf1[19] = bf0[19];
7261
0
    bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
7262
0
    bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
7263
0
    bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
7264
0
    bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
7265
0
    bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
7266
0
    bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
7267
0
    bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
7268
0
    bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
7269
0
    bf1[28] = bf0[28];
7270
0
    bf1[29] = bf0[29];
7271
0
    bf1[30] = bf0[30];
7272
0
    bf1[31] = bf0[31];
7273
0
    bf1[32] = bf0[32] + bf0[47];
7274
0
    bf1[33] = bf0[33] + bf0[46];
7275
0
    bf1[34] = bf0[34] + bf0[45];
7276
0
    bf1[35] = bf0[35] + bf0[44];
7277
0
    bf1[36] = bf0[36] + bf0[43];
7278
0
    bf1[37] = bf0[37] + bf0[42];
7279
0
    bf1[38] = bf0[38] + bf0[41];
7280
0
    bf1[39] = bf0[39] + bf0[40];
7281
0
    bf1[40] = -bf0[40] + bf0[39];
7282
0
    bf1[41] = -bf0[41] + bf0[38];
7283
0
    bf1[42] = -bf0[42] + bf0[37];
7284
0
    bf1[43] = -bf0[43] + bf0[36];
7285
0
    bf1[44] = -bf0[44] + bf0[35];
7286
0
    bf1[45] = -bf0[45] + bf0[34];
7287
0
    bf1[46] = -bf0[46] + bf0[33];
7288
0
    bf1[47] = -bf0[47] + bf0[32];
7289
0
    bf1[48] = -bf0[48] + bf0[63];
7290
0
    bf1[49] = -bf0[49] + bf0[62];
7291
0
    bf1[50] = -bf0[50] + bf0[61];
7292
0
    bf1[51] = -bf0[51] + bf0[60];
7293
0
    bf1[52] = -bf0[52] + bf0[59];
7294
0
    bf1[53] = -bf0[53] + bf0[58];
7295
0
    bf1[54] = -bf0[54] + bf0[57];
7296
0
    bf1[55] = -bf0[55] + bf0[56];
7297
0
    bf1[56] = bf0[56] + bf0[55];
7298
0
    bf1[57] = bf0[57] + bf0[54];
7299
0
    bf1[58] = bf0[58] + bf0[53];
7300
0
    bf1[59] = bf0[59] + bf0[52];
7301
0
    bf1[60] = bf0[60] + bf0[51];
7302
0
    bf1[61] = bf0[61] + bf0[50];
7303
0
    bf1[62] = bf0[62] + bf0[49];
7304
0
    bf1[63] = bf0[63] + bf0[48];
7305
7306
    // stage 4
7307
0
    cospi   = cospi_arr(cos_bit);
7308
0
    bf0     = output;
7309
0
    bf1     = step;
7310
0
    bf1[0]  = bf0[0] + bf0[7];
7311
0
    bf1[1]  = bf0[1] + bf0[6];
7312
0
    bf1[2]  = bf0[2] + bf0[5];
7313
0
    bf1[3]  = bf0[3] + bf0[4];
7314
0
    bf1[4]  = -bf0[4] + bf0[3];
7315
0
    bf1[5]  = -bf0[5] + bf0[2];
7316
0
    bf1[6]  = -bf0[6] + bf0[1];
7317
0
    bf1[7]  = -bf0[7] + bf0[0];
7318
0
    bf1[8]  = bf0[8];
7319
0
    bf1[9]  = bf0[9];
7320
0
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
7321
0
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
7322
0
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
7323
0
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
7324
0
    bf1[14] = bf0[14];
7325
0
    bf1[15] = bf0[15];
7326
0
    bf1[16] = bf0[16] + bf0[23];
7327
0
    bf1[17] = bf0[17] + bf0[22];
7328
0
    bf1[18] = bf0[18] + bf0[21];
7329
0
    bf1[19] = bf0[19] + bf0[20];
7330
0
    bf1[20] = -bf0[20] + bf0[19];
7331
0
    bf1[21] = -bf0[21] + bf0[18];
7332
0
    bf1[22] = -bf0[22] + bf0[17];
7333
0
    bf1[23] = -bf0[23] + bf0[16];
7334
0
    bf1[24] = -bf0[24] + bf0[31];
7335
0
    bf1[25] = -bf0[25] + bf0[30];
7336
0
    bf1[26] = -bf0[26] + bf0[29];
7337
0
    bf1[27] = -bf0[27] + bf0[28];
7338
0
    bf1[28] = bf0[28] + bf0[27];
7339
0
    bf1[29] = bf0[29] + bf0[26];
7340
0
    bf1[30] = bf0[30] + bf0[25];
7341
0
    bf1[31] = bf0[31] + bf0[24];
7342
0
    bf1[32] = bf0[32];
7343
0
    bf1[33] = bf0[33];
7344
0
    bf1[34] = bf0[34];
7345
0
    bf1[35] = bf0[35];
7346
0
    bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
7347
0
    bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
7348
0
    bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
7349
0
    bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
7350
0
    bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
7351
0
    bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
7352
0
    bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
7353
0
    bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
7354
0
    bf1[44] = bf0[44];
7355
0
    bf1[45] = bf0[45];
7356
0
    bf1[46] = bf0[46];
7357
0
    bf1[47] = bf0[47];
7358
0
    bf1[48] = bf0[48];
7359
0
    bf1[49] = bf0[49];
7360
0
    bf1[50] = bf0[50];
7361
0
    bf1[51] = bf0[51];
7362
0
    bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
7363
0
    bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
7364
0
    bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
7365
0
    bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
7366
0
    bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
7367
0
    bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
7368
0
    bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
7369
0
    bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
7370
0
    bf1[60] = bf0[60];
7371
0
    bf1[61] = bf0[61];
7372
0
    bf1[62] = bf0[62];
7373
0
    bf1[63] = bf0[63];
7374
7375
    // stage 5
7376
0
    cospi   = cospi_arr(cos_bit);
7377
0
    bf0     = step;
7378
0
    bf1     = output;
7379
0
    bf1[0]  = bf0[0] + bf0[3];
7380
0
    bf1[1]  = bf0[1] + bf0[2];
7381
0
    bf1[4]  = bf0[4];
7382
0
    bf1[5]  = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
7383
0
    bf1[6]  = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
7384
0
    bf1[7]  = bf0[7];
7385
0
    bf1[8]  = bf0[8] + bf0[11];
7386
0
    bf1[9]  = bf0[9] + bf0[10];
7387
0
    bf1[10] = -bf0[10] + bf0[9];
7388
0
    bf1[11] = -bf0[11] + bf0[8];
7389
0
    bf1[12] = -bf0[12] + bf0[15];
7390
0
    bf1[13] = -bf0[13] + bf0[14];
7391
0
    bf1[14] = bf0[14] + bf0[13];
7392
0
    bf1[15] = bf0[15] + bf0[12];
7393
0
    bf1[16] = bf0[16];
7394
0
    bf1[17] = bf0[17];
7395
0
    bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
7396
0
    bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
7397
0
    bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
7398
0
    bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
7399
0
    bf1[22] = bf0[22];
7400
0
    bf1[23] = bf0[23];
7401
0
    bf1[24] = bf0[24];
7402
0
    bf1[25] = bf0[25];
7403
0
    bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
7404
0
    bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
7405
0
    bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
7406
0
    bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
7407
0
    bf1[30] = bf0[30];
7408
0
    bf1[31] = bf0[31];
7409
0
    bf1[32] = bf0[32] + bf0[39];
7410
0
    bf1[33] = bf0[33] + bf0[38];
7411
0
    bf1[34] = bf0[34] + bf0[37];
7412
0
    bf1[35] = bf0[35] + bf0[36];
7413
0
    bf1[36] = -bf0[36] + bf0[35];
7414
0
    bf1[37] = -bf0[37] + bf0[34];
7415
0
    bf1[38] = -bf0[38] + bf0[33];
7416
0
    bf1[39] = -bf0[39] + bf0[32];
7417
0
    bf1[40] = -bf0[40] + bf0[47];
7418
0
    bf1[41] = -bf0[41] + bf0[46];
7419
0
    bf1[42] = -bf0[42] + bf0[45];
7420
0
    bf1[43] = -bf0[43] + bf0[44];
7421
0
    bf1[44] = bf0[44] + bf0[43];
7422
0
    bf1[45] = bf0[45] + bf0[42];
7423
0
    bf1[46] = bf0[46] + bf0[41];
7424
0
    bf1[47] = bf0[47] + bf0[40];
7425
0
    bf1[48] = bf0[48] + bf0[55];
7426
0
    bf1[49] = bf0[49] + bf0[54];
7427
0
    bf1[50] = bf0[50] + bf0[53];
7428
0
    bf1[51] = bf0[51] + bf0[52];
7429
0
    bf1[52] = -bf0[52] + bf0[51];
7430
0
    bf1[53] = -bf0[53] + bf0[50];
7431
0
    bf1[54] = -bf0[54] + bf0[49];
7432
0
    bf1[55] = -bf0[55] + bf0[48];
7433
0
    bf1[56] = -bf0[56] + bf0[63];
7434
0
    bf1[57] = -bf0[57] + bf0[62];
7435
0
    bf1[58] = -bf0[58] + bf0[61];
7436
0
    bf1[59] = -bf0[59] + bf0[60];
7437
0
    bf1[60] = bf0[60] + bf0[59];
7438
0
    bf1[61] = bf0[61] + bf0[58];
7439
0
    bf1[62] = bf0[62] + bf0[57];
7440
0
    bf1[63] = bf0[63] + bf0[56];
7441
7442
    // stage 6
7443
0
    cospi   = cospi_arr(cos_bit);
7444
0
    bf0     = output;
7445
0
    bf1     = step;
7446
0
    bf1[0]  = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
7447
0
    bf1[4]  = bf0[4] + bf0[5];
7448
0
    bf1[7]  = bf0[7] + bf0[6];
7449
0
    bf1[8]  = bf0[8];
7450
0
    bf1[9]  = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
7451
0
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
7452
0
    bf1[11] = bf0[11];
7453
0
    bf1[12] = bf0[12];
7454
0
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
7455
0
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
7456
0
    bf1[15] = bf0[15];
7457
0
    bf1[16] = bf0[16] + bf0[19];
7458
0
    bf1[17] = bf0[17] + bf0[18];
7459
0
    bf1[18] = -bf0[18] + bf0[17];
7460
0
    bf1[19] = -bf0[19] + bf0[16];
7461
0
    bf1[20] = -bf0[20] + bf0[23];
7462
0
    bf1[21] = -bf0[21] + bf0[22];
7463
0
    bf1[22] = bf0[22] + bf0[21];
7464
0
    bf1[23] = bf0[23] + bf0[20];
7465
0
    bf1[24] = bf0[24] + bf0[27];
7466
0
    bf1[25] = bf0[25] + bf0[26];
7467
0
    bf1[26] = -bf0[26] + bf0[25];
7468
0
    bf1[27] = -bf0[27] + bf0[24];
7469
0
    bf1[28] = -bf0[28] + bf0[31];
7470
0
    bf1[29] = -bf0[29] + bf0[30];
7471
0
    bf1[30] = bf0[30] + bf0[29];
7472
0
    bf1[31] = bf0[31] + bf0[28];
7473
0
    bf1[32] = bf0[32];
7474
0
    bf1[33] = bf0[33];
7475
0
    bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
7476
0
    bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
7477
0
    bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
7478
0
    bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
7479
0
    bf1[38] = bf0[38];
7480
0
    bf1[39] = bf0[39];
7481
0
    bf1[40] = bf0[40];
7482
0
    bf1[41] = bf0[41];
7483
0
    bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
7484
0
    bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
7485
0
    bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
7486
0
    bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
7487
0
    bf1[46] = bf0[46];
7488
0
    bf1[47] = bf0[47];
7489
0
    bf1[48] = bf0[48];
7490
0
    bf1[49] = bf0[49];
7491
0
    bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
7492
0
    bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
7493
0
    bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
7494
0
    bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
7495
0
    bf1[54] = bf0[54];
7496
0
    bf1[55] = bf0[55];
7497
0
    bf1[56] = bf0[56];
7498
0
    bf1[57] = bf0[57];
7499
0
    bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
7500
0
    bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
7501
0
    bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
7502
0
    bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
7503
0
    bf1[62] = bf0[62];
7504
0
    bf1[63] = bf0[63];
7505
7506
    // stage 7
7507
0
    cospi   = cospi_arr(cos_bit);
7508
0
    bf0     = step;
7509
0
    bf1     = output;
7510
0
    bf1[0]  = bf0[0];
7511
0
    bf1[4]  = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
7512
0
    bf1[8]  = bf0[8] + bf0[9];
7513
0
    bf1[11] = bf0[11] + bf0[10];
7514
0
    bf1[12] = bf0[12] + bf0[13];
7515
0
    bf1[15] = bf0[15] + bf0[14];
7516
0
    bf1[16] = bf0[16];
7517
0
    bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
7518
0
    bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
7519
0
    bf1[19] = bf0[19];
7520
0
    bf1[20] = bf0[20];
7521
0
    bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
7522
0
    bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
7523
0
    bf1[23] = bf0[23];
7524
0
    bf1[24] = bf0[24];
7525
0
    bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
7526
0
    bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
7527
0
    bf1[27] = bf0[27];
7528
0
    bf1[28] = bf0[28];
7529
0
    bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
7530
0
    bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
7531
0
    bf1[31] = bf0[31];
7532
0
    bf1[32] = bf0[32] + bf0[35];
7533
0
    bf1[33] = bf0[33] + bf0[34];
7534
0
    bf1[34] = -bf0[34] + bf0[33];
7535
0
    bf1[35] = -bf0[35] + bf0[32];
7536
0
    bf1[36] = -bf0[36] + bf0[39];
7537
0
    bf1[37] = -bf0[37] + bf0[38];
7538
0
    bf1[38] = bf0[38] + bf0[37];
7539
0
    bf1[39] = bf0[39] + bf0[36];
7540
0
    bf1[40] = bf0[40] + bf0[43];
7541
0
    bf1[41] = bf0[41] + bf0[42];
7542
0
    bf1[42] = -bf0[42] + bf0[41];
7543
0
    bf1[43] = -bf0[43] + bf0[40];
7544
0
    bf1[44] = -bf0[44] + bf0[47];
7545
0
    bf1[45] = -bf0[45] + bf0[46];
7546
0
    bf1[46] = bf0[46] + bf0[45];
7547
0
    bf1[47] = bf0[47] + bf0[44];
7548
0
    bf1[48] = bf0[48] + bf0[51];
7549
0
    bf1[49] = bf0[49] + bf0[50];
7550
0
    bf1[50] = -bf0[50] + bf0[49];
7551
0
    bf1[51] = -bf0[51] + bf0[48];
7552
0
    bf1[52] = -bf0[52] + bf0[55];
7553
0
    bf1[53] = -bf0[53] + bf0[54];
7554
0
    bf1[54] = bf0[54] + bf0[53];
7555
0
    bf1[55] = bf0[55] + bf0[52];
7556
0
    bf1[56] = bf0[56] + bf0[59];
7557
0
    bf1[57] = bf0[57] + bf0[58];
7558
0
    bf1[58] = -bf0[58] + bf0[57];
7559
0
    bf1[59] = -bf0[59] + bf0[56];
7560
0
    bf1[60] = -bf0[60] + bf0[63];
7561
0
    bf1[61] = -bf0[61] + bf0[62];
7562
0
    bf1[62] = bf0[62] + bf0[61];
7563
0
    bf1[63] = bf0[63] + bf0[60];
7564
7565
    // stage 8
7566
0
    cospi   = cospi_arr(cos_bit);
7567
0
    bf0     = output;
7568
0
    bf1     = step;
7569
0
    bf1[0]  = bf0[0];
7570
0
    bf1[4]  = bf0[4];
7571
0
    bf1[8]  = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
7572
0
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
7573
0
    bf1[16] = bf0[16] + bf0[17];
7574
0
    bf1[19] = bf0[19] + bf0[18];
7575
0
    bf1[20] = bf0[20] + bf0[21];
7576
0
    bf1[23] = bf0[23] + bf0[22];
7577
0
    bf1[24] = bf0[24] + bf0[25];
7578
0
    bf1[27] = bf0[27] + bf0[26];
7579
0
    bf1[28] = bf0[28] + bf0[29];
7580
0
    bf1[31] = bf0[31] + bf0[30];
7581
0
    bf1[32] = bf0[32];
7582
0
    bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
7583
0
    bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
7584
0
    bf1[35] = bf0[35];
7585
0
    bf1[36] = bf0[36];
7586
0
    bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
7587
0
    bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
7588
0
    bf1[39] = bf0[39];
7589
0
    bf1[40] = bf0[40];
7590
0
    bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
7591
0
    bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
7592
0
    bf1[43] = bf0[43];
7593
0
    bf1[44] = bf0[44];
7594
0
    bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
7595
0
    bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
7596
0
    bf1[47] = bf0[47];
7597
0
    bf1[48] = bf0[48];
7598
0
    bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
7599
0
    bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
7600
0
    bf1[51] = bf0[51];
7601
0
    bf1[52] = bf0[52];
7602
0
    bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
7603
0
    bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
7604
0
    bf1[55] = bf0[55];
7605
0
    bf1[56] = bf0[56];
7606
0
    bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
7607
0
    bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
7608
0
    bf1[59] = bf0[59];
7609
0
    bf1[60] = bf0[60];
7610
0
    bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
7611
0
    bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
7612
0
    bf1[63] = bf0[63];
7613
7614
    // stage 9
7615
0
    cospi   = cospi_arr(cos_bit);
7616
0
    bf0     = step;
7617
0
    bf1     = output;
7618
0
    bf1[0]  = bf0[0];
7619
0
    bf1[4]  = bf0[4];
7620
0
    bf1[8]  = bf0[8];
7621
0
    bf1[12] = bf0[12];
7622
0
    bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
7623
0
    bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
7624
0
    bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
7625
0
    bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
7626
0
    bf1[32] = bf0[32] + bf0[33];
7627
0
    bf1[35] = bf0[35] + bf0[34];
7628
0
    bf1[36] = bf0[36] + bf0[37];
7629
0
    bf1[39] = bf0[39] + bf0[38];
7630
0
    bf1[40] = bf0[40] + bf0[41];
7631
0
    bf1[43] = bf0[43] + bf0[42];
7632
0
    bf1[44] = bf0[44] + bf0[45];
7633
0
    bf1[47] = bf0[47] + bf0[46];
7634
0
    bf1[48] = bf0[48] + bf0[49];
7635
0
    bf1[51] = bf0[51] + bf0[50];
7636
0
    bf1[52] = bf0[52] + bf0[53];
7637
0
    bf1[55] = bf0[55] + bf0[54];
7638
0
    bf1[56] = bf0[56] + bf0[57];
7639
0
    bf1[59] = bf0[59] + bf0[58];
7640
0
    bf1[60] = bf0[60] + bf0[61];
7641
0
    bf1[63] = bf0[63] + bf0[62];
7642
7643
    // stage 10
7644
0
    cospi   = cospi_arr(cos_bit);
7645
0
    bf0     = output;
7646
0
    bf1     = step;
7647
0
    bf1[0]  = bf0[0];
7648
0
    bf1[4]  = bf0[4];
7649
0
    bf1[8]  = bf0[8];
7650
0
    bf1[12] = bf0[12];
7651
0
    bf1[16] = bf0[16];
7652
0
    bf1[20] = bf0[20];
7653
0
    bf1[24] = bf0[24];
7654
0
    bf1[28] = bf0[28];
7655
0
    bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
7656
0
    bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
7657
0
    bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
7658
0
    bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
7659
0
    bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
7660
0
    bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
7661
0
    bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
7662
0
    bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
7663
7664
    // stage 11
7665
0
    bf0     = step;
7666
0
    bf1     = output;
7667
0
    bf1[0]  = bf0[0];
7668
0
    bf1[1]  = bf0[32];
7669
0
    bf1[2]  = bf0[16];
7670
0
    bf1[3]  = bf0[48];
7671
0
    bf1[4]  = bf0[8];
7672
0
    bf1[5]  = bf0[40];
7673
0
    bf1[6]  = bf0[24];
7674
0
    bf1[7]  = bf0[56];
7675
0
    bf1[8]  = bf0[4];
7676
0
    bf1[9]  = bf0[36];
7677
0
    bf1[10] = bf0[20];
7678
0
    bf1[11] = bf0[52];
7679
0
    bf1[12] = bf0[12];
7680
0
    bf1[13] = bf0[44];
7681
0
    bf1[14] = bf0[28];
7682
0
    bf1[15] = bf0[60];
7683
0
}
7684
7685
0
static void av1_fidentity64_N4_c(const int32_t* input, int32_t* output, int8_t cos_bit, const int8_t* stage_range) {
7686
0
    (void)stage_range;
7687
0
    (void)cos_bit;
7688
0
    for (int32_t i = 0; i < 16; ++i) {
7689
0
        output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits);
7690
0
    }
7691
0
    assert(stage_range[0] + new_sqrt2_bits <= 32);
7692
0
}
7693
7694
0
/*
 * Maps a 1-D transform type to the forward kernel used by the N4 path.
 *
 * Returns the kernel function pointer for a known txfmtype. For any other
 * value the debug assert fires and NULL is returned.
 */
static INLINE TxfmFunc fwd_txfm_type_to_func_N4(TxfmType txfmtype) {
    TxfmFunc kernel = NULL;

    switch (txfmtype) {
    // DCT family
    case TXFM_TYPE_DCT4: kernel = svt_av1_fdct4_new_N4; break;
    case TXFM_TYPE_DCT8: kernel = svt_av1_fdct8_new_N4; break;
    case TXFM_TYPE_DCT16: kernel = svt_av1_fdct16_new_N4; break;
    case TXFM_TYPE_DCT32: kernel = svt_av1_fdct32_new_N4; break;
    case TXFM_TYPE_DCT64: kernel = svt_av1_fdct64_new_N4; break;

    // ADST family
    case TXFM_TYPE_ADST4: kernel = svt_av1_fadst4_new_N4; break;
    case TXFM_TYPE_ADST8: kernel = svt_av1_fadst8_new_N4; break;
    case TXFM_TYPE_ADST16: kernel = svt_av1_fadst16_new_N4; break;
    // No N4-specific kernel is referenced for ADST32; the full-size one is returned.
    case TXFM_TYPE_ADST32: kernel = av1_fadst32_new; break;

    // Identity family
    case TXFM_TYPE_IDENTITY4: kernel = svt_av1_fidentity4_N4_c; break;
    case TXFM_TYPE_IDENTITY8: kernel = svt_av1_fidentity8_N4_c; break;
    case TXFM_TYPE_IDENTITY16: kernel = svt_av1_fidentity16_N4_c; break;
    case TXFM_TYPE_IDENTITY32: kernel = svt_av1_fidentity32_N4_c; break;
    case TXFM_TYPE_IDENTITY64: kernel = av1_fidentity64_N4_c; break;

    default:
        // Unknown transform type: trap in debug builds, fall through to NULL otherwise.
        assert(0);
        break;
    }

    return kernel;
}
7729
7730
static INLINE void av1_tranform_two_d_core_N4_c(int16_t* input, uint32_t input_stride, int32_t* output,
7731
0
                                                const Txfm2dFlipCfg* cfg, int32_t* buf, uint8_t bit_depth) {
7732
0
    int32_t c, r;
7733
    // Note when assigning txfm_size_col, we use the txfm_size from the
7734
    // row configuration and vice versa. This is intentionally done to
7735
    // accurately perform rectangular transforms. When the transform is
7736
    // rectangular, the number of columns will be the same as the
7737
    // txfm_size stored in the row cfg struct. It will make no difference
7738
    // for square transforms.
7739
0
    const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
7740
0
    const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
7741
    // Take the shift from the larger dimension in the rectangular case.
7742
0
    const int8_t* shift     = cfg->shift;
7743
0
    const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
7744
0
    int8_t        stage_range_col[MAX_TXFM_STAGE_NUM];
7745
0
    int8_t        stage_range_row[MAX_TXFM_STAGE_NUM];
7746
0
    assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
7747
0
    assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
7748
0
    svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth);
7749
7750
0
    const int8_t   cos_bit_col   = cfg->cos_bit_col;
7751
0
    const int8_t   cos_bit_row   = cfg->cos_bit_row;
7752
0
    const TxfmFunc txfm_func_col = fwd_txfm_type_to_func_N4(cfg->txfm_type_col);
7753
0
    const TxfmFunc txfm_func_row = fwd_txfm_type_to_func_N4(cfg->txfm_type_row);
7754
0
    ASSERT(txfm_func_col != NULL);
7755
0
    ASSERT(txfm_func_row != NULL);
7756
    // use output buffer as temp buffer
7757
0
    int32_t* temp_in  = output;
7758
0
    int32_t* temp_out = output + txfm_size_row;
7759
7760
    // Columns
7761
0
    for (c = 0; c < txfm_size_col; ++c) {
7762
0
        if (cfg->ud_flip == 0) {
7763
0
            for (r = 0; r < txfm_size_row; ++r) {
7764
0
                temp_in[r] = input[r * input_stride + c];
7765
0
            }
7766
0
        } else {
7767
0
            for (r = 0; r < txfm_size_row; ++r) {
7768
                // flip upside down
7769
0
                temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c];
7770
0
            }
7771
0
        }
7772
0
        svt_av1_round_shift_array_c(temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c
7773
0
        txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
7774
0
        svt_av1_round_shift_array_c(temp_out, txfm_size_row / 4, -shift[1]); // NM svt_av1_round_shift_array_c
7775
0
        if (cfg->lr_flip == 0) {
7776
0
            for (r = 0; r < txfm_size_row; ++r) {
7777
0
                buf[r * txfm_size_col + c] = temp_out[r];
7778
0
            }
7779
0
        } else {
7780
0
            for (r = 0; r < txfm_size_row; ++r) {
7781
                // flip from left to right
7782
0
                buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
7783
0
            }
7784
0
        }
7785
0
    }
7786
7787
    // Rows
7788
0
    for (r = 0; r < txfm_size_row / 4; ++r) {
7789
0
        txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row);
7790
0
        svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col / 4, -shift[2]);
7791
7792
0
        if (abs(rect_type) == 1) {
7793
            // Multiply everything by Sqrt2 if the transform is rectangular and the
7794
            // size difference is a factor of 2.
7795
0
            for (c = 0; c < txfm_size_col / 4; ++c) {
7796
0
                output[r * txfm_size_col + c] = round_shift((int64_t)output[r * txfm_size_col + c] * new_sqrt2,
7797
0
                                                            new_sqrt2_bits);
7798
0
            }
7799
0
        }
7800
0
    }
7801
0
    for (int i = 0; i < (txfm_size_col * txfm_size_row); i++) {
7802
0
        if (i % txfm_size_col >= (txfm_size_col >> 2) || i / txfm_size_col >= (txfm_size_row >> 2)) {
7803
0
            output[i] = 0;
7804
0
        }
7805
0
    }
7806
0
}
7807
7808
/*
 * N4 forward 2-D transform entry point for TX_64X64.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 64*64-entry stack scratch buffer.
 */
void svt_aom_transform_two_d_64x64_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                        uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[64 * 64]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_64X64, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7815
7816
/*
 * N4 forward 2-D transform entry point for TX_32X32.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 32*32-entry stack scratch buffer.
 */
void svt_aom_transform_two_d_32x32_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                        uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 32]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_32X32, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7823
7824
/*
 * N4 forward 2-D transform entry point for TX_16X16.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 16*16-entry stack scratch buffer.
 */
void svt_aom_transform_two_d_16x16_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                        uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 16]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_16X16, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7831
7832
/*
 * N4 forward 2-D transform entry point for TX_8X8.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with an 8*8-entry stack scratch buffer.
 */
void svt_aom_transform_two_d_8x8_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                      uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 8]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_8X8, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7839
7840
/*
 * N4 forward 2-D transform entry point for TX_4X4.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 4*4-entry stack scratch buffer.
 */
void svt_aom_transform_two_d_4x4_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                      uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[4 * 4]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_4X4, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7847
7848
/*
 * N4 forward 2-D transform entry point for TX_64X32.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 64*32-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_64x32_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[64 * 32]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_64X32, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7855
7856
/*
 * N4 forward 2-D transform entry point for TX_32X64.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 32*64-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_32x64_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 64]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_32X64, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7863
7864
/*
 * N4 forward 2-D transform entry point for TX_64X16.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 64*16-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_64x16_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[64 * 16]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_64X16, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7871
7872
/*
 * N4 forward 2-D transform entry point for TX_16X64.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 16*64-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_16x64_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 64]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_16X64, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7879
7880
/*
 * N4 forward 2-D transform entry point for TX_32X16.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 32*16-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_32x16_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 16]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_32X16, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7887
7888
/*
 * N4 forward 2-D transform entry point for TX_16X32.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 16*32-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_16x32_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                   uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 32]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_16X32, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7895
7896
/*
 * N4 forward 2-D transform entry point for TX_16X8.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 16*8-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_16x8_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 8]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_16X8, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7903
7904
/*
 * N4 forward 2-D transform entry point for TX_8X16.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with an 8*16-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_8x16_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 16]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_8X16, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7911
7912
/*
 * N4 forward 2-D transform entry point for TX_32X8.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 32*8-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_32x8_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 8]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_32X8, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7919
7920
/*
 * N4 forward 2-D transform entry point for TX_8X32.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with an 8*32-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_8x32_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 32]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_8X32, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7927
7928
/*
 * N4 forward 2-D transform entry point for TX_16X4.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 16*4-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_16x4_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 4]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_16X4, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7935
7936
/*
 * N4 forward 2-D transform entry point for TX_4X16.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 4*16-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_4x16_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                  uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[4 * 16]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_4X16, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7943
7944
/*
 * N4 forward 2-D transform entry point for TX_8X4.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with an 8*4-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_8x4_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                 uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 4]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_8X4, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7951
7952
/*
 * N4 forward 2-D transform entry point for TX_4X8.
 * Builds the Txfm2dFlipCfg for transform_type and delegates to
 * av1_tranform_two_d_core_N4_c with a 4*8-entry stack scratch buffer.
 */
void svt_av1_fwd_txfm2d_4x8_N4_c(int16_t* input, int32_t* output, uint32_t input_stride, TxType transform_type,
                                 uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[4 * 8]; // intermediate buffer handed to the core as `buf`

    svt_aom_transform_config(transform_type, TX_4X8, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}