Coverage Report

Created: 2025-06-22 08:04

/src/aom/av1/encoder/av1_fwd_txfm1d.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <stdlib.h>
13
#include "av1/encoder/av1_fwd_txfm1d.h"
14
#include "av1/common/av1_txfm.h"
15
16
// 4-point forward DCT (going by the function name), evaluated as three fixed
// butterfly stages.
//
//   input       - 4 values to transform; only read.
//   output      - receives the 4 results; also serves as working storage
//                 between stages.
//   cos_bit     - passed to cospi_arr() to obtain the cosine table and to
//                 every half_btf() rotation.
//   stage_range - indexed by stage number (0..3); each entry is handed to
//                 av1_range_check_buf() once that stage has been computed.
//
// Stage 1 stores into output[] while input[] is still being read, so the two
// buffers are presumably expected not to overlap (nothing here asserts it).
void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
               const int8_t *stage_range) {
  // Reordering applied by the last stage: output[i] = scratch[order[i]].
  static const int32_t order[4] = { 0, 2, 1, 3 };
  const int32_t size = 4;
  int32_t scratch[4];
  int32_t stage = 0;
  int32_t i;
  const int32_t *cospi;
  const int32_t *src;
  int32_t *dst;

  // stage 0: range check on the untouched input
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the lower half, mirrored differences above
  ++stage;
  dst = output;
  for (i = 0; i < 2; ++i) {
    dst[i] = input[i] + input[3 - i];
  }
  for (i = 2; i < 4; ++i) {
    dst[i] = -input[i] + input[3 - i];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 2: rotations
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  dst[0] = half_btf(cospi[32], src[0], cospi[32], src[1], cos_bit);
  dst[1] = half_btf(-cospi[32], src[1], cospi[32], src[0], cos_bit);
  dst[2] = half_btf(cospi[48], src[2], cospi[16], src[3], cos_bit);
  dst[3] = half_btf(cospi[48], src[3], -cospi[16], src[2], cos_bit);
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 3: pure reordering into the output buffer
  ++stage;
  src = scratch;
  dst = output;
  for (i = 0; i < 4; ++i) {
    dst[i] = src[order[i]];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);
}
58
59
// 8-point forward DCT (going by the function name), evaluated as five fixed
// butterfly stages that ping-pong between output[] and a local buffer.
//
//   input       - 8 values to transform; only read.
//   output      - receives the 8 results; also serves as working storage
//                 between stages.
//   cos_bit     - passed to cospi_arr() to obtain the cosine table and to
//                 every half_btf() rotation.
//   stage_range - indexed by stage number (0..5); each entry is handed to
//                 av1_range_check_buf() once that stage has been computed.
//
// Stage 1 stores into output[] while input[] is still being read, so the two
// buffers are presumably expected not to overlap (nothing here asserts it).
void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
               const int8_t *stage_range) {
  // Reordering applied by the last stage: output[i] = scratch[order[i]].
  static const int32_t order[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
  const int32_t size = 8;
  int32_t scratch[8];
  int32_t stage = 0;
  int32_t i;
  const int32_t *cospi;
  const int32_t *src;
  int32_t *dst;

  // stage 0: range check on the untouched input
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the lower half, mirrored differences above
  ++stage;
  dst = output;
  for (i = 0; i < 4; ++i) {
    dst[i] = input[i] + input[7 - i];
  }
  for (i = 4; i < 8; ++i) {
    dst[i] = -input[i] + input[7 - i];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 2
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (i = 0; i < 2; ++i) {
    dst[i] = src[i] + src[3 - i];
  }
  for (i = 2; i < 4; ++i) {
    dst[i] = -src[i] + src[3 - i];
  }
  dst[4] = src[4];
  dst[5] = half_btf(-cospi[32], src[5], cospi[32], src[6], cos_bit);
  dst[6] = half_btf(cospi[32], src[6], cospi[32], src[5], cos_bit);
  dst[7] = src[7];
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 3
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  dst[0] = half_btf(cospi[32], src[0], cospi[32], src[1], cos_bit);
  dst[1] = half_btf(-cospi[32], src[1], cospi[32], src[0], cos_bit);
  dst[2] = half_btf(cospi[48], src[2], cospi[16], src[3], cos_bit);
  dst[3] = half_btf(cospi[48], src[3], -cospi[16], src[2], cos_bit);
  dst[4] = src[4] + src[5];
  dst[5] = -src[5] + src[4];
  dst[6] = -src[6] + src[7];
  dst[7] = src[7] + src[6];
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 4
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (i = 0; i < 4; ++i) {
    dst[i] = src[i];
  }
  dst[4] = half_btf(cospi[56], src[4], cospi[8], src[7], cos_bit);
  dst[5] = half_btf(cospi[24], src[5], cospi[40], src[6], cos_bit);
  dst[6] = half_btf(cospi[24], src[6], -cospi[40], src[5], cos_bit);
  dst[7] = half_btf(cospi[56], src[7], -cospi[8], src[4], cos_bit);
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 5: pure reordering into the output buffer
  ++stage;
  src = scratch;
  dst = output;
  for (i = 0; i < 8; ++i) {
    dst[i] = src[order[i]];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);
}
143
144
// 16-point forward DCT (going by the function name), evaluated as seven fixed
// butterfly stages that ping-pong between output[] and a local buffer.
//
//   input       - 16 values to transform; only read.
//   output      - receives the 16 results; also serves as working storage
//                 between stages.
//   cos_bit     - passed to cospi_arr() to obtain the cosine table and to
//                 every half_btf() rotation.
//   stage_range - indexed by stage number (0..7); each entry is handed to
//                 av1_range_check_buf() once that stage has been computed.
//
// Stage 1 stores into output[] while input[] is still being read, so the two
// buffers are presumably expected not to overlap (nothing here asserts it).
void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  // Reordering applied by the last stage: output[i] = scratch[order[i]].
  static const int32_t order[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                     1, 9, 5, 13, 3, 11, 7, 15 };
  const int32_t size = 16;
  int32_t scratch[16];
  int32_t stage = 0;
  int32_t i, base;
  const int32_t *cospi;
  const int32_t *src;
  int32_t *dst;

  // stage 0: range check on the untouched input
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the lower half, mirrored differences above
  ++stage;
  dst = output;
  for (i = 0; i < 8; ++i) {
    dst[i] = input[i] + input[15 - i];
  }
  for (i = 8; i < 16; ++i) {
    dst[i] = -input[i] + input[15 - i];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 2
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (i = 0; i < 4; ++i) {
    dst[i] = src[i] + src[7 - i];
  }
  for (i = 4; i < 8; ++i) {
    dst[i] = -src[i] + src[7 - i];
  }
  dst[8] = src[8];
  dst[9] = src[9];
  // Elements 10..13 are rotated against their mirror partner (i <-> 23 - i).
  for (i = 10; i < 12; ++i) {
    dst[i] = half_btf(-cospi[32], src[i], cospi[32], src[23 - i], cos_bit);
  }
  for (i = 12; i < 14; ++i) {
    dst[i] = half_btf(cospi[32], src[i], cospi[32], src[23 - i], cos_bit);
  }
  dst[14] = src[14];
  dst[15] = src[15];
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 3
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  for (i = 0; i < 2; ++i) {
    dst[i] = src[i] + src[3 - i];
  }
  for (i = 2; i < 4; ++i) {
    dst[i] = -src[i] + src[3 - i];
  }
  dst[4] = src[4];
  dst[5] = half_btf(-cospi[32], src[5], cospi[32], src[6], cos_bit);
  dst[6] = half_btf(cospi[32], src[6], cospi[32], src[5], cos_bit);
  dst[7] = src[7];
  for (i = 8; i < 10; ++i) {
    dst[i] = src[i] + src[19 - i];
  }
  for (i = 10; i < 12; ++i) {
    dst[i] = -src[i] + src[19 - i];
  }
  for (i = 12; i < 14; ++i) {
    dst[i] = -src[i] + src[27 - i];
  }
  for (i = 14; i < 16; ++i) {
    dst[i] = src[i] + src[27 - i];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 4
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  dst[0] = half_btf(cospi[32], src[0], cospi[32], src[1], cos_bit);
  dst[1] = half_btf(-cospi[32], src[1], cospi[32], src[0], cos_bit);
  dst[2] = half_btf(cospi[48], src[2], cospi[16], src[3], cos_bit);
  dst[3] = half_btf(cospi[48], src[3], -cospi[16], src[2], cos_bit);
  dst[4] = src[4] + src[5];
  dst[5] = -src[5] + src[4];
  dst[6] = -src[6] + src[7];
  dst[7] = src[7] + src[6];
  dst[8] = src[8];
  dst[9] = half_btf(-cospi[16], src[9], cospi[48], src[14], cos_bit);
  dst[10] = half_btf(-cospi[48], src[10], -cospi[16], src[13], cos_bit);
  dst[11] = src[11];
  dst[12] = src[12];
  dst[13] = half_btf(cospi[48], src[13], -cospi[16], src[10], cos_bit);
  dst[14] = half_btf(cospi[16], src[14], cospi[48], src[9], cos_bit);
  dst[15] = src[15];
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 5
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  for (i = 0; i < 4; ++i) {
    dst[i] = src[i];
  }
  dst[4] = half_btf(cospi[56], src[4], cospi[8], src[7], cos_bit);
  dst[5] = half_btf(cospi[24], src[5], cospi[40], src[6], cos_bit);
  dst[6] = half_btf(cospi[24], src[6], -cospi[40], src[5], cos_bit);
  dst[7] = half_btf(cospi[56], src[7], -cospi[8], src[4], cos_bit);
  // Elements 8..15 are combined in neighbouring pairs, four at a time.
  for (base = 8; base < 16; base += 4) {
    dst[base] = src[base] + src[base + 1];
    dst[base + 1] = -src[base + 1] + src[base];
    dst[base + 2] = -src[base + 2] + src[base + 3];
    dst[base + 3] = src[base + 3] + src[base + 2];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 6
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (i = 0; i < 8; ++i) {
    dst[i] = src[i];
  }
  dst[8] = half_btf(cospi[60], src[8], cospi[4], src[15], cos_bit);
  dst[9] = half_btf(cospi[28], src[9], cospi[36], src[14], cos_bit);
  dst[10] = half_btf(cospi[44], src[10], cospi[20], src[13], cos_bit);
  dst[11] = half_btf(cospi[12], src[11], cospi[52], src[12], cos_bit);
  dst[12] = half_btf(cospi[12], src[12], -cospi[52], src[11], cos_bit);
  dst[13] = half_btf(cospi[44], src[13], -cospi[20], src[10], cos_bit);
  dst[14] = half_btf(cospi[28], src[14], -cospi[36], src[9], cos_bit);
  dst[15] = half_btf(cospi[60], src[15], -cospi[4], src[8], cos_bit);
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 7: pure reordering into the output buffer
  ++stage;
  src = scratch;
  dst = output;
  for (i = 0; i < 16; ++i) {
    dst[i] = src[order[i]];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);
}
314
315
// 32-point forward DCT (going by the function name), evaluated as nine fixed
// butterfly stages that ping-pong between output[] and a local buffer.
//
//   input       - 32 values to transform; only read.
//   output      - receives the 32 results; also serves as working storage
//                 between stages.
//   cos_bit     - passed to cospi_arr() to obtain the cosine table and to
//                 every half_btf() rotation.
//   stage_range - indexed by stage number (0..9); each entry is handed to
//                 av1_range_check_buf() once that stage has been computed.
//
// Stage 1 stores into output[] while input[] is still being read, so the two
// buffers are presumably expected not to overlap (nothing here asserts it).
void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  // Reordering applied by the last stage: output[i] = scratch[order[i]].
  static const int32_t order[32] = { 0, 16, 8,  24, 4, 20, 12, 28,
                                     2, 18, 10, 26, 6, 22, 14, 30,
                                     1, 17, 9,  25, 5, 21, 13, 29,
                                     3, 19, 11, 27, 7, 23, 15, 31 };
  const int32_t size = 32;
  int32_t scratch[32];
  int32_t stage = 0;
  int32_t i, base;
  const int32_t *cospi;
  const int32_t *src;
  int32_t *dst;

  // stage 0: range check on the untouched input
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the lower half, mirrored differences above
  ++stage;
  dst = output;
  for (i = 0; i < 16; ++i) {
    dst[i] = input[i] + input[31 - i];
  }
  for (i = 16; i < 32; ++i) {
    dst[i] = -input[i] + input[31 - i];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 2
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (i = 0; i < 8; ++i) {
    dst[i] = src[i] + src[15 - i];
  }
  for (i = 8; i < 16; ++i) {
    dst[i] = -src[i] + src[15 - i];
  }
  for (i = 16; i < 20; ++i) {
    dst[i] = src[i];
  }
  // Elements 20..27 are rotated against their mirror partner (i <-> 47 - i).
  for (i = 20; i < 24; ++i) {
    dst[i] = half_btf(-cospi[32], src[i], cospi[32], src[47 - i], cos_bit);
  }
  for (i = 24; i < 28; ++i) {
    dst[i] = half_btf(cospi[32], src[i], cospi[32], src[47 - i], cos_bit);
  }
  for (i = 28; i < 32; ++i) {
    dst[i] = src[i];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 3
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  for (i = 0; i < 4; ++i) {
    dst[i] = src[i] + src[7 - i];
  }
  for (i = 4; i < 8; ++i) {
    dst[i] = -src[i] + src[7 - i];
  }
  dst[8] = src[8];
  dst[9] = src[9];
  for (i = 10; i < 12; ++i) {
    dst[i] = half_btf(-cospi[32], src[i], cospi[32], src[23 - i], cos_bit);
  }
  for (i = 12; i < 14; ++i) {
    dst[i] = half_btf(cospi[32], src[i], cospi[32], src[23 - i], cos_bit);
  }
  dst[14] = src[14];
  dst[15] = src[15];
  for (i = 16; i < 20; ++i) {
    dst[i] = src[i] + src[39 - i];
  }
  for (i = 20; i < 24; ++i) {
    dst[i] = -src[i] + src[39 - i];
  }
  for (i = 24; i < 28; ++i) {
    dst[i] = -src[i] + src[55 - i];
  }
  for (i = 28; i < 32; ++i) {
    dst[i] = src[i] + src[55 - i];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 4
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (i = 0; i < 2; ++i) {
    dst[i] = src[i] + src[3 - i];
  }
  for (i = 2; i < 4; ++i) {
    dst[i] = -src[i] + src[3 - i];
  }
  dst[4] = src[4];
  dst[5] = half_btf(-cospi[32], src[5], cospi[32], src[6], cos_bit);
  dst[6] = half_btf(cospi[32], src[6], cospi[32], src[5], cos_bit);
  dst[7] = src[7];
  for (i = 8; i < 10; ++i) {
    dst[i] = src[i] + src[19 - i];
  }
  for (i = 10; i < 12; ++i) {
    dst[i] = -src[i] + src[19 - i];
  }
  for (i = 12; i < 14; ++i) {
    dst[i] = -src[i] + src[27 - i];
  }
  for (i = 14; i < 16; ++i) {
    dst[i] = src[i] + src[27 - i];
  }
  dst[16] = src[16];
  dst[17] = src[17];
  dst[18] = half_btf(-cospi[16], src[18], cospi[48], src[29], cos_bit);
  dst[19] = half_btf(-cospi[16], src[19], cospi[48], src[28], cos_bit);
  dst[20] = half_btf(-cospi[48], src[20], -cospi[16], src[27], cos_bit);
  dst[21] = half_btf(-cospi[48], src[21], -cospi[16], src[26], cos_bit);
  for (i = 22; i < 26; ++i) {
    dst[i] = src[i];
  }
  dst[26] = half_btf(cospi[48], src[26], -cospi[16], src[21], cos_bit);
  dst[27] = half_btf(cospi[48], src[27], -cospi[16], src[20], cos_bit);
  dst[28] = half_btf(cospi[16], src[28], cospi[48], src[19], cos_bit);
  dst[29] = half_btf(cospi[16], src[29], cospi[48], src[18], cos_bit);
  dst[30] = src[30];
  dst[31] = src[31];
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 5
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  dst[0] = half_btf(cospi[32], src[0], cospi[32], src[1], cos_bit);
  dst[1] = half_btf(-cospi[32], src[1], cospi[32], src[0], cos_bit);
  dst[2] = half_btf(cospi[48], src[2], cospi[16], src[3], cos_bit);
  dst[3] = half_btf(cospi[48], src[3], -cospi[16], src[2], cos_bit);
  dst[4] = src[4] + src[5];
  dst[5] = -src[5] + src[4];
  dst[6] = -src[6] + src[7];
  dst[7] = src[7] + src[6];
  dst[8] = src[8];
  dst[9] = half_btf(-cospi[16], src[9], cospi[48], src[14], cos_bit);
  dst[10] = half_btf(-cospi[48], src[10], -cospi[16], src[13], cos_bit);
  dst[11] = src[11];
  dst[12] = src[12];
  dst[13] = half_btf(cospi[48], src[13], -cospi[16], src[10], cos_bit);
  dst[14] = half_btf(cospi[16], src[14], cospi[48], src[9], cos_bit);
  dst[15] = src[15];
  // Elements 16..31 are handled as two identical groups of eight.
  for (base = 16; base < 32; base += 8) {
    dst[base] = src[base] + src[base + 3];
    dst[base + 1] = src[base + 1] + src[base + 2];
    dst[base + 2] = -src[base + 2] + src[base + 1];
    dst[base + 3] = -src[base + 3] + src[base];
    dst[base + 4] = -src[base + 4] + src[base + 7];
    dst[base + 5] = -src[base + 5] + src[base + 6];
    dst[base + 6] = src[base + 6] + src[base + 5];
    dst[base + 7] = src[base + 7] + src[base + 4];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 6
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (i = 0; i < 4; ++i) {
    dst[i] = src[i];
  }
  dst[4] = half_btf(cospi[56], src[4], cospi[8], src[7], cos_bit);
  dst[5] = half_btf(cospi[24], src[5], cospi[40], src[6], cos_bit);
  dst[6] = half_btf(cospi[24], src[6], -cospi[40], src[5], cos_bit);
  dst[7] = half_btf(cospi[56], src[7], -cospi[8], src[4], cos_bit);
  // Elements 8..15 are combined in neighbouring pairs, four at a time.
  for (base = 8; base < 16; base += 4) {
    dst[base] = src[base] + src[base + 1];
    dst[base + 1] = -src[base + 1] + src[base];
    dst[base + 2] = -src[base + 2] + src[base + 3];
    dst[base + 3] = src[base + 3] + src[base + 2];
  }
  dst[16] = src[16];
  dst[17] = half_btf(-cospi[8], src[17], cospi[56], src[30], cos_bit);
  dst[18] = half_btf(-cospi[56], src[18], -cospi[8], src[29], cos_bit);
  dst[19] = src[19];
  dst[20] = src[20];
  dst[21] = half_btf(-cospi[40], src[21], cospi[24], src[26], cos_bit);
  dst[22] = half_btf(-cospi[24], src[22], -cospi[40], src[25], cos_bit);
  dst[23] = src[23];
  dst[24] = src[24];
  dst[25] = half_btf(cospi[24], src[25], -cospi[40], src[22], cos_bit);
  dst[26] = half_btf(cospi[40], src[26], cospi[24], src[21], cos_bit);
  dst[27] = src[27];
  dst[28] = src[28];
  dst[29] = half_btf(cospi[56], src[29], -cospi[8], src[18], cos_bit);
  dst[30] = half_btf(cospi[8], src[30], cospi[56], src[17], cos_bit);
  dst[31] = src[31];
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 7
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = scratch;
  dst = output;
  for (i = 0; i < 8; ++i) {
    dst[i] = src[i];
  }
  dst[8] = half_btf(cospi[60], src[8], cospi[4], src[15], cos_bit);
  dst[9] = half_btf(cospi[28], src[9], cospi[36], src[14], cos_bit);
  dst[10] = half_btf(cospi[44], src[10], cospi[20], src[13], cos_bit);
  dst[11] = half_btf(cospi[12], src[11], cospi[52], src[12], cos_bit);
  dst[12] = half_btf(cospi[12], src[12], -cospi[52], src[11], cos_bit);
  dst[13] = half_btf(cospi[44], src[13], -cospi[20], src[10], cos_bit);
  dst[14] = half_btf(cospi[28], src[14], -cospi[36], src[9], cos_bit);
  dst[15] = half_btf(cospi[60], src[15], -cospi[4], src[8], cos_bit);
  // Elements 16..31 are combined in neighbouring pairs, four at a time.
  for (base = 16; base < 32; base += 4) {
    dst[base] = src[base] + src[base + 1];
    dst[base + 1] = -src[base + 1] + src[base];
    dst[base + 2] = -src[base + 2] + src[base + 3];
    dst[base + 3] = src[base + 3] + src[base + 2];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 8
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (i = 0; i < 16; ++i) {
    dst[i] = src[i];
  }
  dst[16] = half_btf(cospi[62], src[16], cospi[2], src[31], cos_bit);
  dst[17] = half_btf(cospi[30], src[17], cospi[34], src[30], cos_bit);
  dst[18] = half_btf(cospi[46], src[18], cospi[18], src[29], cos_bit);
  dst[19] = half_btf(cospi[14], src[19], cospi[50], src[28], cos_bit);
  dst[20] = half_btf(cospi[54], src[20], cospi[10], src[27], cos_bit);
  dst[21] = half_btf(cospi[22], src[21], cospi[42], src[26], cos_bit);
  dst[22] = half_btf(cospi[38], src[22], cospi[26], src[25], cos_bit);
  dst[23] = half_btf(cospi[6], src[23], cospi[58], src[24], cos_bit);
  dst[24] = half_btf(cospi[6], src[24], -cospi[58], src[23], cos_bit);
  dst[25] = half_btf(cospi[38], src[25], -cospi[26], src[22], cos_bit);
  dst[26] = half_btf(cospi[22], src[26], -cospi[42], src[21], cos_bit);
  dst[27] = half_btf(cospi[54], src[27], -cospi[10], src[20], cos_bit);
  dst[28] = half_btf(cospi[14], src[28], -cospi[50], src[19], cos_bit);
  dst[29] = half_btf(cospi[46], src[29], -cospi[18], src[18], cos_bit);
  dst[30] = half_btf(cospi[30], src[30], -cospi[34], src[17], cos_bit);
  dst[31] = half_btf(cospi[62], src[31], -cospi[2], src[16], cos_bit);
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 9: pure reordering into the output buffer
  ++stage;
  src = scratch;
  dst = output;
  for (i = 0; i < 32; ++i) {
    dst[i] = src[order[i]];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);
}
675
676
// 4-point forward ADST (going by the function name).
//
//   input       - 4 values to transform; only read.
//   output      - receives the 4 results.
//   cos_bit     - precision passed to sinpi_arr(); also the shift applied by
//                 round_shift() to the final values.
//   stage_range - indexed 0..6; entries are passed to the range-check helpers
//                 for the matching stage (widened by cos_bit for values that
//                 still carry the sine-table scale).
//
// An all-zero input yields an all-zero output without further work.
void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  const int bit = cos_bit;
  const int32_t *sinpi = sinpi_arr(bit);

  // stage 0
  av1_range_check_buf(0, input, input, 4, stage_range[0]);
  const int32_t in0 = input[0];
  const int32_t in1 = input[1];
  const int32_t in2 = input[2];
  const int32_t in3 = input[3];

  if ((in0 | in1 | in2 | in3) == 0) {
    output[3] = 0;
    output[2] = 0;
    output[1] = 0;
    output[0] = 0;
    return;
  }

  // stage 1: scale the inputs by the sine constants
  const int wide1 = bit + stage_range[1];
  const int32_t p0 = range_check_value(sinpi[1] * in0, wide1);
  const int32_t p1 = range_check_value(sinpi[4] * in0, wide1);
  const int32_t p2 = range_check_value(sinpi[2] * in1, wide1);
  const int32_t p3 = range_check_value(sinpi[1] * in1, wide1);
  const int32_t p4 = range_check_value(sinpi[3] * in2, wide1);
  const int32_t p5 = range_check_value(sinpi[4] * in3, wide1);
  const int32_t p6 = range_check_value(sinpi[2] * in3, wide1);
  const int32_t sum01 = range_check_value(in0 + in1, stage_range[1]);

  // stage 2
  const int32_t mix = range_check_value(sum01 - in3, stage_range[2]);

  // stage 3
  const int wide3 = bit + stage_range[3];
  const int32_t a0 = range_check_value(p0 + p2, wide3);
  const int32_t a1 = range_check_value(sinpi[3] * mix, wide3);
  const int32_t a2 = range_check_value(p1 - p3, wide3);
  const int32_t a3 = range_check_value(p4, wide3);

  // stage 4
  const int wide4 = bit + stage_range[4];
  const int32_t b0 = range_check_value(a0 + p5, wide4);
  const int32_t b2 = range_check_value(a2 + p6, wide4);

  // stage 5
  const int wide5 = bit + stage_range[5];
  const int32_t r0 = range_check_value(b0 + a3, wide5);
  const int32_t r1 = range_check_value(a1, wide5);
  const int32_t r2 = range_check_value(b2 - a3, wide5);
  const int32_t r3_partial = range_check_value(b2 - b0, wide5);

  // stage 6
  const int32_t r3 = range_check_value(r3_partial + a3, bit + stage_range[6]);

  // 1-D transform scaling factor is sqrt(2).
  output[0] = round_shift(r0, bit);
  output[1] = round_shift(r1, bit);
  output[2] = round_shift(r2, bit);
  output[3] = round_shift(r3, bit);
  av1_range_check_buf(6, input, output, 4, stage_range[6]);
}
734
735
// 8-point forward ADST (going by the function name), evaluated as seven fixed
// stages that ping-pong between output[] and a local buffer.
//
//   input       - 8 values to transform; only read. Must not be the same
//                 pointer as output (asserted).
//   output      - receives the 8 results; also serves as working storage
//                 between stages.
//   cos_bit     - passed to cospi_arr() to obtain the cosine table and to
//                 every half_btf() rotation.
//   stage_range - indexed by stage number (0..7); each entry is handed to
//                 av1_range_check_buf() once that stage has been computed.
void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  // Stage 1 gather: output[i] = +/- input[pick[i]], negated where flip[i] != 0.
  static const int32_t pick[8] = { 0, 7, 3, 4, 1, 6, 2, 5 };
  static const int8_t flip[8] = { 0, 1, 1, 0, 1, 0, 0, 1 };
  // Reordering applied by the last stage: output[i] = scratch[order[i]].
  static const int32_t order[8] = { 1, 6, 3, 4, 5, 2, 7, 0 };
  const int32_t size = 8;
  int32_t scratch[8];
  int32_t stage = 0;
  int32_t i, base;
  const int32_t *cospi;
  const int32_t *src;
  int32_t *dst;

  // stage 0: range check on the untouched input
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);

  // stage 1: signed permutation of the input
  ++stage;
  assert(output != input);
  dst = output;
  for (i = 0; i < 8; ++i) {
    if (flip[i]) {
      dst[i] = -input[pick[i]];
    } else {
      dst[i] = input[pick[i]];
    }
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 2: in each group of four, pass two through and rotate the other two
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (base = 0; base < 8; base += 4) {
    dst[base] = src[base];
    dst[base + 1] = src[base + 1];
    dst[base + 2] =
        half_btf(cospi[32], src[base + 2], cospi[32], src[base + 3], cos_bit);
    dst[base + 3] =
        half_btf(cospi[32], src[base + 2], -cospi[32], src[base + 3], cos_bit);
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 3: sums and differences at distance 2 inside each group of four
  ++stage;
  src = scratch;
  dst = output;
  for (base = 0; base < 8; base += 4) {
    dst[base] = src[base] + src[base + 2];
    dst[base + 1] = src[base + 1] + src[base + 3];
    dst[base + 2] = src[base] - src[base + 2];
    dst[base + 3] = src[base + 1] - src[base + 3];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 4
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (i = 0; i < 4; ++i) {
    dst[i] = src[i];
  }
  dst[4] = half_btf(cospi[16], src[4], cospi[48], src[5], cos_bit);
  dst[5] = half_btf(cospi[48], src[4], -cospi[16], src[5], cos_bit);
  dst[6] = half_btf(-cospi[48], src[6], cospi[16], src[7], cos_bit);
  dst[7] = half_btf(cospi[16], src[6], cospi[48], src[7], cos_bit);
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 5: sums and differences at distance 4
  ++stage;
  src = scratch;
  dst = output;
  for (i = 0; i < 4; ++i) {
    dst[i] = src[i] + src[i + 4];
  }
  for (i = 4; i < 8; ++i) {
    dst[i] = src[i - 4] - src[i];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 6: rotate each neighbouring pair
  ++stage;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  dst[0] = half_btf(cospi[4], src[0], cospi[60], src[1], cos_bit);
  dst[1] = half_btf(cospi[60], src[0], -cospi[4], src[1], cos_bit);
  dst[2] = half_btf(cospi[20], src[2], cospi[44], src[3], cos_bit);
  dst[3] = half_btf(cospi[44], src[2], -cospi[20], src[3], cos_bit);
  dst[4] = half_btf(cospi[36], src[4], cospi[28], src[5], cos_bit);
  dst[5] = half_btf(cospi[28], src[4], -cospi[36], src[5], cos_bit);
  dst[6] = half_btf(cospi[52], src[6], cospi[12], src[7], cos_bit);
  dst[7] = half_btf(cospi[12], src[6], -cospi[52], src[7], cos_bit);
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 7: pure reordering into the output buffer
  ++stage;
  src = scratch;
  dst = output;
  for (i = 0; i < 8; ++i) {
    dst[i] = src[order[i]];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);
}
848
849
// 16-point 1-D transform kernel (the forward ADST, going by the function
// name), evaluated as nine butterfly stages that alternate between |output|
// and a local scratch buffer. The final stage writes |output|.
//
//   input       - 16 coefficients to read; asserted to be distinct from
//                 |output|.
//   output      - 16-entry buffer; used as working storage on odd stages and
//                 holds the result on return.
//   cos_bit     - handed to cospi_arr() to pick the cosine table and to every
//                 half_btf() call.
//   stage_range - indexed by stage number (0..9); each entry is forwarded to
//                 av1_range_check_buf() together with that stage's buffer.
void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                 const int8_t *stage_range) {
  // Stage-1 wiring: output[k] is input[kStage1Src[k]], negated when
  // kStage1Neg[k] is non-zero.
  static const int8_t kStage1Src[16] = { 0, 15, 7, 8,  3, 12, 4, 11,
                                         1, 14, 6, 9,  2, 13, 5, 10 };
  static const int8_t kStage1Neg[16] = { 0, 1, 1, 0, 1, 0, 0, 1,
                                         1, 0, 0, 1, 0, 1, 1, 0 };

  const int32_t size = 16;
  const int32_t *cospi;
  int32_t scratch[16];
  int32_t *src, *dst;
  int32_t stage = 0;

  // stage 0: check the untouched input.
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);

  // stage 1: signed permutation of the input, written in index order.
  stage++;
  assert(output != input);
  dst = output;
  for (int k = 0; k < 16; ++k) {
    const int32_t v = input[kStage1Src[k]];
    dst[k] = kStage1Neg[k] ? -v : v;
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 2: in every group of four, the first two values pass through and
  // the last two are rotated with cospi[32].
  stage++;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (int base = 0; base < 16; base += 4) {
    dst[base + 0] = src[base + 0];
    dst[base + 1] = src[base + 1];
    dst[base + 2] =
        half_btf(cospi[32], src[base + 2], cospi[32], src[base + 3], cos_bit);
    dst[base + 3] =
        half_btf(cospi[32], src[base + 2], -cospi[32], src[base + 3], cos_bit);
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 3: sum/difference butterflies at distance 2 inside groups of four.
  stage++;
  src = scratch;
  dst = output;
  for (int base = 0; base < 16; base += 4) {
    for (int k = 0; k < 2; ++k) {
      dst[base + k] = src[base + k] + src[base + k + 2];
      dst[base + k + 2] = src[base + k] - src[base + k + 2];
    }
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 4: in every group of eight, the lower half passes through and the
  // upper half is rotated with cospi[16] / cospi[48].
  stage++;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (int base = 0; base < 16; base += 8) {
    for (int k = 0; k < 4; ++k) dst[base + k] = src[base + k];
    dst[base + 4] =
        half_btf(cospi[16], src[base + 4], cospi[48], src[base + 5], cos_bit);
    dst[base + 5] =
        half_btf(cospi[48], src[base + 4], -cospi[16], src[base + 5], cos_bit);
    dst[base + 6] =
        half_btf(-cospi[48], src[base + 6], cospi[16], src[base + 7], cos_bit);
    dst[base + 7] =
        half_btf(cospi[16], src[base + 6], cospi[48], src[base + 7], cos_bit);
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 5: sum/difference butterflies at distance 4 inside groups of eight.
  stage++;
  src = scratch;
  dst = output;
  for (int base = 0; base < 16; base += 8) {
    for (int k = 0; k < 4; ++k) {
      dst[base + k] = src[base + k] + src[base + k + 4];
      dst[base + k + 4] = src[base + k] - src[base + k + 4];
    }
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 6: entries 0..7 pass through; entries 8..15 are rotated pairwise.
  stage++;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (int k = 0; k < 8; ++k) dst[k] = src[k];
  dst[8] = half_btf(cospi[8], src[8], cospi[56], src[9], cos_bit);
  dst[9] = half_btf(cospi[56], src[8], -cospi[8], src[9], cos_bit);
  dst[10] = half_btf(cospi[40], src[10], cospi[24], src[11], cos_bit);
  dst[11] = half_btf(cospi[24], src[10], -cospi[40], src[11], cos_bit);
  dst[12] = half_btf(-cospi[56], src[12], cospi[8], src[13], cos_bit);
  dst[13] = half_btf(cospi[8], src[12], cospi[56], src[13], cos_bit);
  dst[14] = half_btf(-cospi[24], src[14], cospi[40], src[15], cos_bit);
  dst[15] = half_btf(cospi[40], src[14], cospi[24], src[15], cos_bit);
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 7: sum/difference butterflies at distance 8 across the whole buffer.
  stage++;
  src = scratch;
  dst = output;
  for (int k = 0; k < 8; ++k) {
    dst[k] = src[k] + src[k + 8];
    dst[k + 8] = src[k] - src[k + 8];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 8: rotate each adjacent pair (2p, 2p + 1) using the weights
  // cospi[2 + 8p] and cospi[62 - 8p].
  stage++;
  cospi = cospi_arr(cos_bit);
  src = output;
  dst = scratch;
  for (int p = 0; p < 8; ++p) {
    const int32_t w_a = cospi[2 + 8 * p];
    const int32_t w_b = cospi[62 - 8 * p];
    dst[2 * p] = half_btf(w_a, src[2 * p], w_b, src[2 * p + 1], cos_bit);
    dst[2 * p + 1] = half_btf(w_b, src[2 * p], -w_a, src[2 * p + 1], cos_bit);
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);

  // stage 9: final reordering. Even outputs take the odd scratch entries in
  // ascending order; odd outputs take the even scratch entries in descending
  // order.
  stage++;
  src = scratch;
  dst = output;
  for (int p = 0; p < 8; ++p) {
    dst[2 * p] = src[2 * p + 1];
    dst[2 * p + 1] = src[14 - 2 * p];
  }
  av1_range_check_buf(stage, input, dst, size, stage_range[stage]);
}
1063
1064
void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1065
0
                      const int8_t *stage_range) {
1066
0
  (void)cos_bit;
1067
0
  for (int i = 0; i < 4; ++i)
1068
0
    output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
1069
0
  assert(stage_range[0] + NewSqrt2Bits <= 32);
1070
0
  av1_range_check_buf(0, input, output, 4, stage_range[0]);
1071
0
}
1072
1073
// 8-point identity transform: each output is the matching input times 2.
//
//   input       - 8 values to scale.
//   output      - receives the 8 doubled values.
//   cos_bit     - unused.
//   stage_range - only entry 0 is read; it is forwarded to
//                 av1_range_check_buf().
void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  (void)cos_bit;
  int idx = 0;
  while (idx < 8) {
    const int32_t coeff = input[idx];
    output[idx] = coeff * 2;
    ++idx;
  }
  av1_range_check_buf(0, input, output, 8, stage_range[0]);
}
1079
1080
void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1081
0
                       const int8_t *stage_range) {
1082
0
  (void)cos_bit;
1083
0
  for (int i = 0; i < 16; ++i)
1084
0
    output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
1085
0
  assert(stage_range[0] + NewSqrt2Bits <= 32);
1086
0
  av1_range_check_buf(0, input, output, 16, stage_range[0]);
1087
0
}
1088
1089
// 32-point identity transform: each output is the matching input times 4.
//
//   input       - 32 values to scale.
//   output      - receives the 32 scaled values.
//   cos_bit     - unused.
//   stage_range - only entry 0 is read; it is forwarded to
//                 av1_range_check_buf().
void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
  (void)cos_bit;
  int idx = 0;
  while (idx < 32) {
    const int32_t coeff = input[idx];
    output[idx] = coeff * 4;
    ++idx;
  }
  av1_range_check_buf(0, input, output, 32, stage_range[0]);
}
1095
1096
void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
1097
0
                const int8_t *stage_range) {
1098
0
  const int32_t size = 64;
1099
0
  const int32_t *cospi;
1100
1101
0
  int32_t stage = 0;
1102
0
  int32_t *bf0, *bf1;
1103
0
  int32_t step[64];
1104
1105
  // stage 0;
1106
0
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
1107
1108
  // stage 1;
1109
0
  stage++;
1110
0
  bf1 = output;
1111
0
  bf1[0] = input[0] + input[63];
1112
0
  bf1[1] = input[1] + input[62];
1113
0
  bf1[2] = input[2] + input[61];
1114
0
  bf1[3] = input[3] + input[60];
1115
0
  bf1[4] = input[4] + input[59];
1116
0
  bf1[5] = input[5] + input[58];
1117
0
  bf1[6] = input[6] + input[57];
1118
0
  bf1[7] = input[7] + input[56];
1119
0
  bf1[8] = input[8] + input[55];
1120
0
  bf1[9] = input[9] + input[54];
1121
0
  bf1[10] = input[10] + input[53];
1122
0
  bf1[11] = input[11] + input[52];
1123
0
  bf1[12] = input[12] + input[51];
1124
0
  bf1[13] = input[13] + input[50];
1125
0
  bf1[14] = input[14] + input[49];
1126
0
  bf1[15] = input[15] + input[48];
1127
0
  bf1[16] = input[16] + input[47];
1128
0
  bf1[17] = input[17] + input[46];
1129
0
  bf1[18] = input[18] + input[45];
1130
0
  bf1[19] = input[19] + input[44];
1131
0
  bf1[20] = input[20] + input[43];
1132
0
  bf1[21] = input[21] + input[42];
1133
0
  bf1[22] = input[22] + input[41];
1134
0
  bf1[23] = input[23] + input[40];
1135
0
  bf1[24] = input[24] + input[39];
1136
0
  bf1[25] = input[25] + input[38];
1137
0
  bf1[26] = input[26] + input[37];
1138
0
  bf1[27] = input[27] + input[36];
1139
0
  bf1[28] = input[28] + input[35];
1140
0
  bf1[29] = input[29] + input[34];
1141
0
  bf1[30] = input[30] + input[33];
1142
0
  bf1[31] = input[31] + input[32];
1143
0
  bf1[32] = -input[32] + input[31];
1144
0
  bf1[33] = -input[33] + input[30];
1145
0
  bf1[34] = -input[34] + input[29];
1146
0
  bf1[35] = -input[35] + input[28];
1147
0
  bf1[36] = -input[36] + input[27];
1148
0
  bf1[37] = -input[37] + input[26];
1149
0
  bf1[38] = -input[38] + input[25];
1150
0
  bf1[39] = -input[39] + input[24];
1151
0
  bf1[40] = -input[40] + input[23];
1152
0
  bf1[41] = -input[41] + input[22];
1153
0
  bf1[42] = -input[42] + input[21];
1154
0
  bf1[43] = -input[43] + input[20];
1155
0
  bf1[44] = -input[44] + input[19];
1156
0
  bf1[45] = -input[45] + input[18];
1157
0
  bf1[46] = -input[46] + input[17];
1158
0
  bf1[47] = -input[47] + input[16];
1159
0
  bf1[48] = -input[48] + input[15];
1160
0
  bf1[49] = -input[49] + input[14];
1161
0
  bf1[50] = -input[50] + input[13];
1162
0
  bf1[51] = -input[51] + input[12];
1163
0
  bf1[52] = -input[52] + input[11];
1164
0
  bf1[53] = -input[53] + input[10];
1165
0
  bf1[54] = -input[54] + input[9];
1166
0
  bf1[55] = -input[55] + input[8];
1167
0
  bf1[56] = -input[56] + input[7];
1168
0
  bf1[57] = -input[57] + input[6];
1169
0
  bf1[58] = -input[58] + input[5];
1170
0
  bf1[59] = -input[59] + input[4];
1171
0
  bf1[60] = -input[60] + input[3];
1172
0
  bf1[61] = -input[61] + input[2];
1173
0
  bf1[62] = -input[62] + input[1];
1174
0
  bf1[63] = -input[63] + input[0];
1175
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1176
1177
  // stage 2
1178
0
  stage++;
1179
0
  cospi = cospi_arr(cos_bit);
1180
0
  bf0 = output;
1181
0
  bf1 = step;
1182
0
  bf1[0] = bf0[0] + bf0[31];
1183
0
  bf1[1] = bf0[1] + bf0[30];
1184
0
  bf1[2] = bf0[2] + bf0[29];
1185
0
  bf1[3] = bf0[3] + bf0[28];
1186
0
  bf1[4] = bf0[4] + bf0[27];
1187
0
  bf1[5] = bf0[5] + bf0[26];
1188
0
  bf1[6] = bf0[6] + bf0[25];
1189
0
  bf1[7] = bf0[7] + bf0[24];
1190
0
  bf1[8] = bf0[8] + bf0[23];
1191
0
  bf1[9] = bf0[9] + bf0[22];
1192
0
  bf1[10] = bf0[10] + bf0[21];
1193
0
  bf1[11] = bf0[11] + bf0[20];
1194
0
  bf1[12] = bf0[12] + bf0[19];
1195
0
  bf1[13] = bf0[13] + bf0[18];
1196
0
  bf1[14] = bf0[14] + bf0[17];
1197
0
  bf1[15] = bf0[15] + bf0[16];
1198
0
  bf1[16] = -bf0[16] + bf0[15];
1199
0
  bf1[17] = -bf0[17] + bf0[14];
1200
0
  bf1[18] = -bf0[18] + bf0[13];
1201
0
  bf1[19] = -bf0[19] + bf0[12];
1202
0
  bf1[20] = -bf0[20] + bf0[11];
1203
0
  bf1[21] = -bf0[21] + bf0[10];
1204
0
  bf1[22] = -bf0[22] + bf0[9];
1205
0
  bf1[23] = -bf0[23] + bf0[8];
1206
0
  bf1[24] = -bf0[24] + bf0[7];
1207
0
  bf1[25] = -bf0[25] + bf0[6];
1208
0
  bf1[26] = -bf0[26] + bf0[5];
1209
0
  bf1[27] = -bf0[27] + bf0[4];
1210
0
  bf1[28] = -bf0[28] + bf0[3];
1211
0
  bf1[29] = -bf0[29] + bf0[2];
1212
0
  bf1[30] = -bf0[30] + bf0[1];
1213
0
  bf1[31] = -bf0[31] + bf0[0];
1214
0
  bf1[32] = bf0[32];
1215
0
  bf1[33] = bf0[33];
1216
0
  bf1[34] = bf0[34];
1217
0
  bf1[35] = bf0[35];
1218
0
  bf1[36] = bf0[36];
1219
0
  bf1[37] = bf0[37];
1220
0
  bf1[38] = bf0[38];
1221
0
  bf1[39] = bf0[39];
1222
0
  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1223
0
  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1224
0
  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1225
0
  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1226
0
  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1227
0
  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1228
0
  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1229
0
  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1230
0
  bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
1231
0
  bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
1232
0
  bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
1233
0
  bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
1234
0
  bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
1235
0
  bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
1236
0
  bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
1237
0
  bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
1238
0
  bf1[56] = bf0[56];
1239
0
  bf1[57] = bf0[57];
1240
0
  bf1[58] = bf0[58];
1241
0
  bf1[59] = bf0[59];
1242
0
  bf1[60] = bf0[60];
1243
0
  bf1[61] = bf0[61];
1244
0
  bf1[62] = bf0[62];
1245
0
  bf1[63] = bf0[63];
1246
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1247
1248
  // stage 3
1249
0
  stage++;
1250
0
  cospi = cospi_arr(cos_bit);
1251
0
  bf0 = step;
1252
0
  bf1 = output;
1253
0
  bf1[0] = bf0[0] + bf0[15];
1254
0
  bf1[1] = bf0[1] + bf0[14];
1255
0
  bf1[2] = bf0[2] + bf0[13];
1256
0
  bf1[3] = bf0[3] + bf0[12];
1257
0
  bf1[4] = bf0[4] + bf0[11];
1258
0
  bf1[5] = bf0[5] + bf0[10];
1259
0
  bf1[6] = bf0[6] + bf0[9];
1260
0
  bf1[7] = bf0[7] + bf0[8];
1261
0
  bf1[8] = -bf0[8] + bf0[7];
1262
0
  bf1[9] = -bf0[9] + bf0[6];
1263
0
  bf1[10] = -bf0[10] + bf0[5];
1264
0
  bf1[11] = -bf0[11] + bf0[4];
1265
0
  bf1[12] = -bf0[12] + bf0[3];
1266
0
  bf1[13] = -bf0[13] + bf0[2];
1267
0
  bf1[14] = -bf0[14] + bf0[1];
1268
0
  bf1[15] = -bf0[15] + bf0[0];
1269
0
  bf1[16] = bf0[16];
1270
0
  bf1[17] = bf0[17];
1271
0
  bf1[18] = bf0[18];
1272
0
  bf1[19] = bf0[19];
1273
0
  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1274
0
  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1275
0
  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1276
0
  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1277
0
  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
1278
0
  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
1279
0
  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
1280
0
  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
1281
0
  bf1[28] = bf0[28];
1282
0
  bf1[29] = bf0[29];
1283
0
  bf1[30] = bf0[30];
1284
0
  bf1[31] = bf0[31];
1285
0
  bf1[32] = bf0[32] + bf0[47];
1286
0
  bf1[33] = bf0[33] + bf0[46];
1287
0
  bf1[34] = bf0[34] + bf0[45];
1288
0
  bf1[35] = bf0[35] + bf0[44];
1289
0
  bf1[36] = bf0[36] + bf0[43];
1290
0
  bf1[37] = bf0[37] + bf0[42];
1291
0
  bf1[38] = bf0[38] + bf0[41];
1292
0
  bf1[39] = bf0[39] + bf0[40];
1293
0
  bf1[40] = -bf0[40] + bf0[39];
1294
0
  bf1[41] = -bf0[41] + bf0[38];
1295
0
  bf1[42] = -bf0[42] + bf0[37];
1296
0
  bf1[43] = -bf0[43] + bf0[36];
1297
0
  bf1[44] = -bf0[44] + bf0[35];
1298
0
  bf1[45] = -bf0[45] + bf0[34];
1299
0
  bf1[46] = -bf0[46] + bf0[33];
1300
0
  bf1[47] = -bf0[47] + bf0[32];
1301
0
  bf1[48] = -bf0[48] + bf0[63];
1302
0
  bf1[49] = -bf0[49] + bf0[62];
1303
0
  bf1[50] = -bf0[50] + bf0[61];
1304
0
  bf1[51] = -bf0[51] + bf0[60];
1305
0
  bf1[52] = -bf0[52] + bf0[59];
1306
0
  bf1[53] = -bf0[53] + bf0[58];
1307
0
  bf1[54] = -bf0[54] + bf0[57];
1308
0
  bf1[55] = -bf0[55] + bf0[56];
1309
0
  bf1[56] = bf0[56] + bf0[55];
1310
0
  bf1[57] = bf0[57] + bf0[54];
1311
0
  bf1[58] = bf0[58] + bf0[53];
1312
0
  bf1[59] = bf0[59] + bf0[52];
1313
0
  bf1[60] = bf0[60] + bf0[51];
1314
0
  bf1[61] = bf0[61] + bf0[50];
1315
0
  bf1[62] = bf0[62] + bf0[49];
1316
0
  bf1[63] = bf0[63] + bf0[48];
1317
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1318
1319
  // stage 4
1320
0
  stage++;
1321
0
  cospi = cospi_arr(cos_bit);
1322
0
  bf0 = output;
1323
0
  bf1 = step;
1324
0
  bf1[0] = bf0[0] + bf0[7];
1325
0
  bf1[1] = bf0[1] + bf0[6];
1326
0
  bf1[2] = bf0[2] + bf0[5];
1327
0
  bf1[3] = bf0[3] + bf0[4];
1328
0
  bf1[4] = -bf0[4] + bf0[3];
1329
0
  bf1[5] = -bf0[5] + bf0[2];
1330
0
  bf1[6] = -bf0[6] + bf0[1];
1331
0
  bf1[7] = -bf0[7] + bf0[0];
1332
0
  bf1[8] = bf0[8];
1333
0
  bf1[9] = bf0[9];
1334
0
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1335
0
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1336
0
  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
1337
0
  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
1338
0
  bf1[14] = bf0[14];
1339
0
  bf1[15] = bf0[15];
1340
0
  bf1[16] = bf0[16] + bf0[23];
1341
0
  bf1[17] = bf0[17] + bf0[22];
1342
0
  bf1[18] = bf0[18] + bf0[21];
1343
0
  bf1[19] = bf0[19] + bf0[20];
1344
0
  bf1[20] = -bf0[20] + bf0[19];
1345
0
  bf1[21] = -bf0[21] + bf0[18];
1346
0
  bf1[22] = -bf0[22] + bf0[17];
1347
0
  bf1[23] = -bf0[23] + bf0[16];
1348
0
  bf1[24] = -bf0[24] + bf0[31];
1349
0
  bf1[25] = -bf0[25] + bf0[30];
1350
0
  bf1[26] = -bf0[26] + bf0[29];
1351
0
  bf1[27] = -bf0[27] + bf0[28];
1352
0
  bf1[28] = bf0[28] + bf0[27];
1353
0
  bf1[29] = bf0[29] + bf0[26];
1354
0
  bf1[30] = bf0[30] + bf0[25];
1355
0
  bf1[31] = bf0[31] + bf0[24];
1356
0
  bf1[32] = bf0[32];
1357
0
  bf1[33] = bf0[33];
1358
0
  bf1[34] = bf0[34];
1359
0
  bf1[35] = bf0[35];
1360
0
  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
1361
0
  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
1362
0
  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
1363
0
  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
1364
0
  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
1365
0
  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
1366
0
  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
1367
0
  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
1368
0
  bf1[44] = bf0[44];
1369
0
  bf1[45] = bf0[45];
1370
0
  bf1[46] = bf0[46];
1371
0
  bf1[47] = bf0[47];
1372
0
  bf1[48] = bf0[48];
1373
0
  bf1[49] = bf0[49];
1374
0
  bf1[50] = bf0[50];
1375
0
  bf1[51] = bf0[51];
1376
0
  bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
1377
0
  bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
1378
0
  bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
1379
0
  bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
1380
0
  bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
1381
0
  bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
1382
0
  bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
1383
0
  bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
1384
0
  bf1[60] = bf0[60];
1385
0
  bf1[61] = bf0[61];
1386
0
  bf1[62] = bf0[62];
1387
0
  bf1[63] = bf0[63];
1388
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1389
1390
  // stage 5
1391
0
  stage++;
1392
0
  cospi = cospi_arr(cos_bit);
1393
0
  bf0 = step;
1394
0
  bf1 = output;
1395
0
  bf1[0] = bf0[0] + bf0[3];
1396
0
  bf1[1] = bf0[1] + bf0[2];
1397
0
  bf1[2] = -bf0[2] + bf0[1];
1398
0
  bf1[3] = -bf0[3] + bf0[0];
1399
0
  bf1[4] = bf0[4];
1400
0
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1401
0
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
1402
0
  bf1[7] = bf0[7];
1403
0
  bf1[8] = bf0[8] + bf0[11];
1404
0
  bf1[9] = bf0[9] + bf0[10];
1405
0
  bf1[10] = -bf0[10] + bf0[9];
1406
0
  bf1[11] = -bf0[11] + bf0[8];
1407
0
  bf1[12] = -bf0[12] + bf0[15];
1408
0
  bf1[13] = -bf0[13] + bf0[14];
1409
0
  bf1[14] = bf0[14] + bf0[13];
1410
0
  bf1[15] = bf0[15] + bf0[12];
1411
0
  bf1[16] = bf0[16];
1412
0
  bf1[17] = bf0[17];
1413
0
  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
1414
0
  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
1415
0
  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
1416
0
  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
1417
0
  bf1[22] = bf0[22];
1418
0
  bf1[23] = bf0[23];
1419
0
  bf1[24] = bf0[24];
1420
0
  bf1[25] = bf0[25];
1421
0
  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
1422
0
  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
1423
0
  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
1424
0
  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
1425
0
  bf1[30] = bf0[30];
1426
0
  bf1[31] = bf0[31];
1427
0
  bf1[32] = bf0[32] + bf0[39];
1428
0
  bf1[33] = bf0[33] + bf0[38];
1429
0
  bf1[34] = bf0[34] + bf0[37];
1430
0
  bf1[35] = bf0[35] + bf0[36];
1431
0
  bf1[36] = -bf0[36] + bf0[35];
1432
0
  bf1[37] = -bf0[37] + bf0[34];
1433
0
  bf1[38] = -bf0[38] + bf0[33];
1434
0
  bf1[39] = -bf0[39] + bf0[32];
1435
0
  bf1[40] = -bf0[40] + bf0[47];
1436
0
  bf1[41] = -bf0[41] + bf0[46];
1437
0
  bf1[42] = -bf0[42] + bf0[45];
1438
0
  bf1[43] = -bf0[43] + bf0[44];
1439
0
  bf1[44] = bf0[44] + bf0[43];
1440
0
  bf1[45] = bf0[45] + bf0[42];
1441
0
  bf1[46] = bf0[46] + bf0[41];
1442
0
  bf1[47] = bf0[47] + bf0[40];
1443
0
  bf1[48] = bf0[48] + bf0[55];
1444
0
  bf1[49] = bf0[49] + bf0[54];
1445
0
  bf1[50] = bf0[50] + bf0[53];
1446
0
  bf1[51] = bf0[51] + bf0[52];
1447
0
  bf1[52] = -bf0[52] + bf0[51];
1448
0
  bf1[53] = -bf0[53] + bf0[50];
1449
0
  bf1[54] = -bf0[54] + bf0[49];
1450
0
  bf1[55] = -bf0[55] + bf0[48];
1451
0
  bf1[56] = -bf0[56] + bf0[63];
1452
0
  bf1[57] = -bf0[57] + bf0[62];
1453
0
  bf1[58] = -bf0[58] + bf0[61];
1454
0
  bf1[59] = -bf0[59] + bf0[60];
1455
0
  bf1[60] = bf0[60] + bf0[59];
1456
0
  bf1[61] = bf0[61] + bf0[58];
1457
0
  bf1[62] = bf0[62] + bf0[57];
1458
0
  bf1[63] = bf0[63] + bf0[56];
1459
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1460
1461
  // stage 6
1462
0
  stage++;
1463
0
  cospi = cospi_arr(cos_bit);
1464
0
  bf0 = output;
1465
0
  bf1 = step;
1466
0
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
1467
0
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
1468
0
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
1469
0
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
1470
0
  bf1[4] = bf0[4] + bf0[5];
1471
0
  bf1[5] = -bf0[5] + bf0[4];
1472
0
  bf1[6] = -bf0[6] + bf0[7];
1473
0
  bf1[7] = bf0[7] + bf0[6];
1474
0
  bf1[8] = bf0[8];
1475
0
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
1476
0
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
1477
0
  bf1[11] = bf0[11];
1478
0
  bf1[12] = bf0[12];
1479
0
  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
1480
0
  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
1481
0
  bf1[15] = bf0[15];
1482
0
  bf1[16] = bf0[16] + bf0[19];
1483
0
  bf1[17] = bf0[17] + bf0[18];
1484
0
  bf1[18] = -bf0[18] + bf0[17];
1485
0
  bf1[19] = -bf0[19] + bf0[16];
1486
0
  bf1[20] = -bf0[20] + bf0[23];
1487
0
  bf1[21] = -bf0[21] + bf0[22];
1488
0
  bf1[22] = bf0[22] + bf0[21];
1489
0
  bf1[23] = bf0[23] + bf0[20];
1490
0
  bf1[24] = bf0[24] + bf0[27];
1491
0
  bf1[25] = bf0[25] + bf0[26];
1492
0
  bf1[26] = -bf0[26] + bf0[25];
1493
0
  bf1[27] = -bf0[27] + bf0[24];
1494
0
  bf1[28] = -bf0[28] + bf0[31];
1495
0
  bf1[29] = -bf0[29] + bf0[30];
1496
0
  bf1[30] = bf0[30] + bf0[29];
1497
0
  bf1[31] = bf0[31] + bf0[28];
1498
0
  bf1[32] = bf0[32];
1499
0
  bf1[33] = bf0[33];
1500
0
  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
1501
0
  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
1502
0
  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
1503
0
  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
1504
0
  bf1[38] = bf0[38];
1505
0
  bf1[39] = bf0[39];
1506
0
  bf1[40] = bf0[40];
1507
0
  bf1[41] = bf0[41];
1508
0
  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
1509
0
  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
1510
0
  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
1511
0
  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
1512
0
  bf1[46] = bf0[46];
1513
0
  bf1[47] = bf0[47];
1514
0
  bf1[48] = bf0[48];
1515
0
  bf1[49] = bf0[49];
1516
0
  bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
1517
0
  bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
1518
0
  bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
1519
0
  bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
1520
0
  bf1[54] = bf0[54];
1521
0
  bf1[55] = bf0[55];
1522
0
  bf1[56] = bf0[56];
1523
0
  bf1[57] = bf0[57];
1524
0
  bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
1525
0
  bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
1526
0
  bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
1527
0
  bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
1528
0
  bf1[62] = bf0[62];
1529
0
  bf1[63] = bf0[63];
1530
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1531
1532
  // stage 7
1533
0
  stage++;
1534
0
  cospi = cospi_arr(cos_bit);
1535
0
  bf0 = step;
1536
0
  bf1 = output;
1537
0
  bf1[0] = bf0[0];
1538
0
  bf1[1] = bf0[1];
1539
0
  bf1[2] = bf0[2];
1540
0
  bf1[3] = bf0[3];
1541
0
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
1542
0
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
1543
0
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
1544
0
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
1545
0
  bf1[8] = bf0[8] + bf0[9];
1546
0
  bf1[9] = -bf0[9] + bf0[8];
1547
0
  bf1[10] = -bf0[10] + bf0[11];
1548
0
  bf1[11] = bf0[11] + bf0[10];
1549
0
  bf1[12] = bf0[12] + bf0[13];
1550
0
  bf1[13] = -bf0[13] + bf0[12];
1551
0
  bf1[14] = -bf0[14] + bf0[15];
1552
0
  bf1[15] = bf0[15] + bf0[14];
1553
0
  bf1[16] = bf0[16];
1554
0
  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
1555
0
  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
1556
0
  bf1[19] = bf0[19];
1557
0
  bf1[20] = bf0[20];
1558
0
  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
1559
0
  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
1560
0
  bf1[23] = bf0[23];
1561
0
  bf1[24] = bf0[24];
1562
0
  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
1563
0
  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
1564
0
  bf1[27] = bf0[27];
1565
0
  bf1[28] = bf0[28];
1566
0
  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
1567
0
  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
1568
0
  bf1[31] = bf0[31];
1569
0
  bf1[32] = bf0[32] + bf0[35];
1570
0
  bf1[33] = bf0[33] + bf0[34];
1571
0
  bf1[34] = -bf0[34] + bf0[33];
1572
0
  bf1[35] = -bf0[35] + bf0[32];
1573
0
  bf1[36] = -bf0[36] + bf0[39];
1574
0
  bf1[37] = -bf0[37] + bf0[38];
1575
0
  bf1[38] = bf0[38] + bf0[37];
1576
0
  bf1[39] = bf0[39] + bf0[36];
1577
0
  bf1[40] = bf0[40] + bf0[43];
1578
0
  bf1[41] = bf0[41] + bf0[42];
1579
0
  bf1[42] = -bf0[42] + bf0[41];
1580
0
  bf1[43] = -bf0[43] + bf0[40];
1581
0
  bf1[44] = -bf0[44] + bf0[47];
1582
0
  bf1[45] = -bf0[45] + bf0[46];
1583
0
  bf1[46] = bf0[46] + bf0[45];
1584
0
  bf1[47] = bf0[47] + bf0[44];
1585
0
  bf1[48] = bf0[48] + bf0[51];
1586
0
  bf1[49] = bf0[49] + bf0[50];
1587
0
  bf1[50] = -bf0[50] + bf0[49];
1588
0
  bf1[51] = -bf0[51] + bf0[48];
1589
0
  bf1[52] = -bf0[52] + bf0[55];
1590
0
  bf1[53] = -bf0[53] + bf0[54];
1591
0
  bf1[54] = bf0[54] + bf0[53];
1592
0
  bf1[55] = bf0[55] + bf0[52];
1593
0
  bf1[56] = bf0[56] + bf0[59];
1594
0
  bf1[57] = bf0[57] + bf0[58];
1595
0
  bf1[58] = -bf0[58] + bf0[57];
1596
0
  bf1[59] = -bf0[59] + bf0[56];
1597
0
  bf1[60] = -bf0[60] + bf0[63];
1598
0
  bf1[61] = -bf0[61] + bf0[62];
1599
0
  bf1[62] = bf0[62] + bf0[61];
1600
0
  bf1[63] = bf0[63] + bf0[60];
1601
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1602
1603
  // stage 8
1604
0
  stage++;
1605
0
  cospi = cospi_arr(cos_bit);
1606
0
  bf0 = output;
1607
0
  bf1 = step;
1608
0
  bf1[0] = bf0[0];
1609
0
  bf1[1] = bf0[1];
1610
0
  bf1[2] = bf0[2];
1611
0
  bf1[3] = bf0[3];
1612
0
  bf1[4] = bf0[4];
1613
0
  bf1[5] = bf0[5];
1614
0
  bf1[6] = bf0[6];
1615
0
  bf1[7] = bf0[7];
1616
0
  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
1617
0
  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
1618
0
  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
1619
0
  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
1620
0
  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
1621
0
  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
1622
0
  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
1623
0
  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
1624
0
  bf1[16] = bf0[16] + bf0[17];
1625
0
  bf1[17] = -bf0[17] + bf0[16];
1626
0
  bf1[18] = -bf0[18] + bf0[19];
1627
0
  bf1[19] = bf0[19] + bf0[18];
1628
0
  bf1[20] = bf0[20] + bf0[21];
1629
0
  bf1[21] = -bf0[21] + bf0[20];
1630
0
  bf1[22] = -bf0[22] + bf0[23];
1631
0
  bf1[23] = bf0[23] + bf0[22];
1632
0
  bf1[24] = bf0[24] + bf0[25];
1633
0
  bf1[25] = -bf0[25] + bf0[24];
1634
0
  bf1[26] = -bf0[26] + bf0[27];
1635
0
  bf1[27] = bf0[27] + bf0[26];
1636
0
  bf1[28] = bf0[28] + bf0[29];
1637
0
  bf1[29] = -bf0[29] + bf0[28];
1638
0
  bf1[30] = -bf0[30] + bf0[31];
1639
0
  bf1[31] = bf0[31] + bf0[30];
1640
0
  bf1[32] = bf0[32];
1641
0
  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
1642
0
  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
1643
0
  bf1[35] = bf0[35];
1644
0
  bf1[36] = bf0[36];
1645
0
  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
1646
0
  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
1647
0
  bf1[39] = bf0[39];
1648
0
  bf1[40] = bf0[40];
1649
0
  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
1650
0
  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
1651
0
  bf1[43] = bf0[43];
1652
0
  bf1[44] = bf0[44];
1653
0
  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
1654
0
  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
1655
0
  bf1[47] = bf0[47];
1656
0
  bf1[48] = bf0[48];
1657
0
  bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
1658
0
  bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
1659
0
  bf1[51] = bf0[51];
1660
0
  bf1[52] = bf0[52];
1661
0
  bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
1662
0
  bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
1663
0
  bf1[55] = bf0[55];
1664
0
  bf1[56] = bf0[56];
1665
0
  bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
1666
0
  bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
1667
0
  bf1[59] = bf0[59];
1668
0
  bf1[60] = bf0[60];
1669
0
  bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
1670
0
  bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
1671
0
  bf1[63] = bf0[63];
1672
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1673
1674
  // stage 9
1675
0
  stage++;
1676
0
  cospi = cospi_arr(cos_bit);
1677
0
  bf0 = step;
1678
0
  bf1 = output;
1679
0
  bf1[0] = bf0[0];
1680
0
  bf1[1] = bf0[1];
1681
0
  bf1[2] = bf0[2];
1682
0
  bf1[3] = bf0[3];
1683
0
  bf1[4] = bf0[4];
1684
0
  bf1[5] = bf0[5];
1685
0
  bf1[6] = bf0[6];
1686
0
  bf1[7] = bf0[7];
1687
0
  bf1[8] = bf0[8];
1688
0
  bf1[9] = bf0[9];
1689
0
  bf1[10] = bf0[10];
1690
0
  bf1[11] = bf0[11];
1691
0
  bf1[12] = bf0[12];
1692
0
  bf1[13] = bf0[13];
1693
0
  bf1[14] = bf0[14];
1694
0
  bf1[15] = bf0[15];
1695
0
  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
1696
0
  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
1697
0
  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
1698
0
  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
1699
0
  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
1700
0
  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
1701
0
  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
1702
0
  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
1703
0
  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
1704
0
  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
1705
0
  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
1706
0
  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
1707
0
  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
1708
0
  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
1709
0
  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
1710
0
  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
1711
0
  bf1[32] = bf0[32] + bf0[33];
1712
0
  bf1[33] = -bf0[33] + bf0[32];
1713
0
  bf1[34] = -bf0[34] + bf0[35];
1714
0
  bf1[35] = bf0[35] + bf0[34];
1715
0
  bf1[36] = bf0[36] + bf0[37];
1716
0
  bf1[37] = -bf0[37] + bf0[36];
1717
0
  bf1[38] = -bf0[38] + bf0[39];
1718
0
  bf1[39] = bf0[39] + bf0[38];
1719
0
  bf1[40] = bf0[40] + bf0[41];
1720
0
  bf1[41] = -bf0[41] + bf0[40];
1721
0
  bf1[42] = -bf0[42] + bf0[43];
1722
0
  bf1[43] = bf0[43] + bf0[42];
1723
0
  bf1[44] = bf0[44] + bf0[45];
1724
0
  bf1[45] = -bf0[45] + bf0[44];
1725
0
  bf1[46] = -bf0[46] + bf0[47];
1726
0
  bf1[47] = bf0[47] + bf0[46];
1727
0
  bf1[48] = bf0[48] + bf0[49];
1728
0
  bf1[49] = -bf0[49] + bf0[48];
1729
0
  bf1[50] = -bf0[50] + bf0[51];
1730
0
  bf1[51] = bf0[51] + bf0[50];
1731
0
  bf1[52] = bf0[52] + bf0[53];
1732
0
  bf1[53] = -bf0[53] + bf0[52];
1733
0
  bf1[54] = -bf0[54] + bf0[55];
1734
0
  bf1[55] = bf0[55] + bf0[54];
1735
0
  bf1[56] = bf0[56] + bf0[57];
1736
0
  bf1[57] = -bf0[57] + bf0[56];
1737
0
  bf1[58] = -bf0[58] + bf0[59];
1738
0
  bf1[59] = bf0[59] + bf0[58];
1739
0
  bf1[60] = bf0[60] + bf0[61];
1740
0
  bf1[61] = -bf0[61] + bf0[60];
1741
0
  bf1[62] = -bf0[62] + bf0[63];
1742
0
  bf1[63] = bf0[63] + bf0[62];
1743
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1744
1745
  // stage 10
1746
0
  stage++;
1747
0
  cospi = cospi_arr(cos_bit);
1748
0
  bf0 = output;
1749
0
  bf1 = step;
1750
0
  bf1[0] = bf0[0];
1751
0
  bf1[1] = bf0[1];
1752
0
  bf1[2] = bf0[2];
1753
0
  bf1[3] = bf0[3];
1754
0
  bf1[4] = bf0[4];
1755
0
  bf1[5] = bf0[5];
1756
0
  bf1[6] = bf0[6];
1757
0
  bf1[7] = bf0[7];
1758
0
  bf1[8] = bf0[8];
1759
0
  bf1[9] = bf0[9];
1760
0
  bf1[10] = bf0[10];
1761
0
  bf1[11] = bf0[11];
1762
0
  bf1[12] = bf0[12];
1763
0
  bf1[13] = bf0[13];
1764
0
  bf1[14] = bf0[14];
1765
0
  bf1[15] = bf0[15];
1766
0
  bf1[16] = bf0[16];
1767
0
  bf1[17] = bf0[17];
1768
0
  bf1[18] = bf0[18];
1769
0
  bf1[19] = bf0[19];
1770
0
  bf1[20] = bf0[20];
1771
0
  bf1[21] = bf0[21];
1772
0
  bf1[22] = bf0[22];
1773
0
  bf1[23] = bf0[23];
1774
0
  bf1[24] = bf0[24];
1775
0
  bf1[25] = bf0[25];
1776
0
  bf1[26] = bf0[26];
1777
0
  bf1[27] = bf0[27];
1778
0
  bf1[28] = bf0[28];
1779
0
  bf1[29] = bf0[29];
1780
0
  bf1[30] = bf0[30];
1781
0
  bf1[31] = bf0[31];
1782
0
  bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
1783
0
  bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
1784
0
  bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
1785
0
  bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
1786
0
  bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
1787
0
  bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
1788
0
  bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
1789
0
  bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
1790
0
  bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
1791
0
  bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
1792
0
  bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
1793
0
  bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
1794
0
  bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
1795
0
  bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
1796
0
  bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
1797
0
  bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
1798
0
  bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
1799
0
  bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
1800
0
  bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
1801
0
  bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
1802
0
  bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
1803
0
  bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
1804
0
  bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
1805
0
  bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
1806
0
  bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
1807
0
  bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
1808
0
  bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
1809
0
  bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
1810
0
  bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
1811
0
  bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
1812
0
  bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
1813
0
  bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
1814
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1815
1816
  // stage 11
1817
0
  stage++;
1818
0
  bf0 = step;
1819
0
  bf1 = output;
1820
0
  bf1[0] = bf0[0];
1821
0
  bf1[1] = bf0[32];
1822
0
  bf1[2] = bf0[16];
1823
0
  bf1[3] = bf0[48];
1824
0
  bf1[4] = bf0[8];
1825
0
  bf1[5] = bf0[40];
1826
0
  bf1[6] = bf0[24];
1827
0
  bf1[7] = bf0[56];
1828
0
  bf1[8] = bf0[4];
1829
0
  bf1[9] = bf0[36];
1830
0
  bf1[10] = bf0[20];
1831
0
  bf1[11] = bf0[52];
1832
0
  bf1[12] = bf0[12];
1833
0
  bf1[13] = bf0[44];
1834
0
  bf1[14] = bf0[28];
1835
0
  bf1[15] = bf0[60];
1836
0
  bf1[16] = bf0[2];
1837
0
  bf1[17] = bf0[34];
1838
0
  bf1[18] = bf0[18];
1839
0
  bf1[19] = bf0[50];
1840
0
  bf1[20] = bf0[10];
1841
0
  bf1[21] = bf0[42];
1842
0
  bf1[22] = bf0[26];
1843
0
  bf1[23] = bf0[58];
1844
0
  bf1[24] = bf0[6];
1845
0
  bf1[25] = bf0[38];
1846
0
  bf1[26] = bf0[22];
1847
0
  bf1[27] = bf0[54];
1848
0
  bf1[28] = bf0[14];
1849
0
  bf1[29] = bf0[46];
1850
0
  bf1[30] = bf0[30];
1851
0
  bf1[31] = bf0[62];
1852
0
  bf1[32] = bf0[1];
1853
0
  bf1[33] = bf0[33];
1854
0
  bf1[34] = bf0[17];
1855
0
  bf1[35] = bf0[49];
1856
0
  bf1[36] = bf0[9];
1857
0
  bf1[37] = bf0[41];
1858
0
  bf1[38] = bf0[25];
1859
0
  bf1[39] = bf0[57];
1860
0
  bf1[40] = bf0[5];
1861
0
  bf1[41] = bf0[37];
1862
0
  bf1[42] = bf0[21];
1863
0
  bf1[43] = bf0[53];
1864
0
  bf1[44] = bf0[13];
1865
0
  bf1[45] = bf0[45];
1866
0
  bf1[46] = bf0[29];
1867
0
  bf1[47] = bf0[61];
1868
0
  bf1[48] = bf0[3];
1869
0
  bf1[49] = bf0[35];
1870
0
  bf1[50] = bf0[19];
1871
0
  bf1[51] = bf0[51];
1872
0
  bf1[52] = bf0[11];
1873
0
  bf1[53] = bf0[43];
1874
0
  bf1[54] = bf0[27];
1875
0
  bf1[55] = bf0[59];
1876
0
  bf1[56] = bf0[7];
1877
0
  bf1[57] = bf0[39];
1878
0
  bf1[58] = bf0[23];
1879
0
  bf1[59] = bf0[55];
1880
0
  bf1[60] = bf0[15];
1881
0
  bf1[61] = bf0[47];
1882
0
  bf1[62] = bf0[31];
1883
0
  bf1[63] = bf0[63];
1884
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1885
0
}