Coverage Report

Created: 2026-02-26 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/aom/av1/encoder/av1_fwd_txfm1d.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <stdlib.h>
13
#include "av1/encoder/av1_fwd_txfm1d.h"
14
#include "av1/common/av1_txfm.h"
15
16
void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
17
1.31M
               const int8_t *stage_range) {
18
1.31M
  const int32_t size = 4;
19
1.31M
  const int32_t *cospi;
20
21
1.31M
  int32_t stage = 0;
22
1.31M
  int32_t *bf0, *bf1;
23
1.31M
  int32_t step[4];
24
25
  // stage 0;
26
1.31M
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
27
28
  // stage 1;
29
1.31M
  stage++;
30
1.31M
  bf1 = output;
31
1.31M
  bf1[0] = input[0] + input[3];
32
1.31M
  bf1[1] = input[1] + input[2];
33
1.31M
  bf1[2] = -input[2] + input[1];
34
1.31M
  bf1[3] = -input[3] + input[0];
35
1.31M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
36
37
  // stage 2
38
1.31M
  stage++;
39
1.31M
  cospi = cospi_arr(cos_bit);
40
1.31M
  bf0 = output;
41
1.31M
  bf1 = step;
42
1.31M
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
43
1.31M
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
44
1.31M
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
45
1.31M
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
46
1.31M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
47
48
  // stage 3
49
1.31M
  stage++;
50
1.31M
  bf0 = step;
51
1.31M
  bf1 = output;
52
1.31M
  bf1[0] = bf0[0];
53
1.31M
  bf1[1] = bf0[2];
54
1.31M
  bf1[2] = bf0[1];
55
1.31M
  bf1[3] = bf0[3];
56
1.31M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
57
1.31M
}
58
59
void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
60
4.98M
               const int8_t *stage_range) {
61
4.98M
  const int32_t size = 8;
62
4.98M
  const int32_t *cospi;
63
64
4.98M
  int32_t stage = 0;
65
4.98M
  int32_t *bf0, *bf1;
66
4.98M
  int32_t step[8];
67
68
  // stage 0;
69
4.98M
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
70
71
  // stage 1;
72
4.98M
  stage++;
73
4.98M
  bf1 = output;
74
4.98M
  bf1[0] = input[0] + input[7];
75
4.98M
  bf1[1] = input[1] + input[6];
76
4.98M
  bf1[2] = input[2] + input[5];
77
4.98M
  bf1[3] = input[3] + input[4];
78
4.98M
  bf1[4] = -input[4] + input[3];
79
4.98M
  bf1[5] = -input[5] + input[2];
80
4.98M
  bf1[6] = -input[6] + input[1];
81
4.98M
  bf1[7] = -input[7] + input[0];
82
4.98M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
83
84
  // stage 2
85
4.98M
  stage++;
86
4.98M
  cospi = cospi_arr(cos_bit);
87
4.98M
  bf0 = output;
88
4.98M
  bf1 = step;
89
4.98M
  bf1[0] = bf0[0] + bf0[3];
90
4.98M
  bf1[1] = bf0[1] + bf0[2];
91
4.98M
  bf1[2] = -bf0[2] + bf0[1];
92
4.98M
  bf1[3] = -bf0[3] + bf0[0];
93
4.98M
  bf1[4] = bf0[4];
94
4.98M
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
95
4.98M
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
96
4.98M
  bf1[7] = bf0[7];
97
4.98M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
98
99
  // stage 3
100
4.98M
  stage++;
101
4.98M
  cospi = cospi_arr(cos_bit);
102
4.98M
  bf0 = step;
103
4.98M
  bf1 = output;
104
4.98M
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
105
4.98M
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
106
4.98M
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
107
4.98M
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
108
4.98M
  bf1[4] = bf0[4] + bf0[5];
109
4.98M
  bf1[5] = -bf0[5] + bf0[4];
110
4.98M
  bf1[6] = -bf0[6] + bf0[7];
111
4.98M
  bf1[7] = bf0[7] + bf0[6];
112
4.98M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
113
114
  // stage 4
115
4.98M
  stage++;
116
4.98M
  cospi = cospi_arr(cos_bit);
117
4.98M
  bf0 = output;
118
4.98M
  bf1 = step;
119
4.98M
  bf1[0] = bf0[0];
120
4.98M
  bf1[1] = bf0[1];
121
4.98M
  bf1[2] = bf0[2];
122
4.98M
  bf1[3] = bf0[3];
123
4.98M
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
124
4.98M
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
125
4.98M
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
126
4.98M
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
127
4.98M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
128
129
  // stage 5
130
4.98M
  stage++;
131
4.98M
  bf0 = step;
132
4.98M
  bf1 = output;
133
4.98M
  bf1[0] = bf0[0];
134
4.98M
  bf1[1] = bf0[4];
135
4.98M
  bf1[2] = bf0[2];
136
4.98M
  bf1[3] = bf0[6];
137
4.98M
  bf1[4] = bf0[1];
138
4.98M
  bf1[5] = bf0[5];
139
4.98M
  bf1[6] = bf0[3];
140
4.98M
  bf1[7] = bf0[7];
141
4.98M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
142
4.98M
}
143
144
void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
145
10.7M
                const int8_t *stage_range) {
146
10.7M
  const int32_t size = 16;
147
10.7M
  const int32_t *cospi;
148
149
10.7M
  int32_t stage = 0;
150
10.7M
  int32_t *bf0, *bf1;
151
10.7M
  int32_t step[16];
152
153
  // stage 0;
154
10.7M
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
155
156
  // stage 1;
157
10.7M
  stage++;
158
10.7M
  bf1 = output;
159
10.7M
  bf1[0] = input[0] + input[15];
160
10.7M
  bf1[1] = input[1] + input[14];
161
10.7M
  bf1[2] = input[2] + input[13];
162
10.7M
  bf1[3] = input[3] + input[12];
163
10.7M
  bf1[4] = input[4] + input[11];
164
10.7M
  bf1[5] = input[5] + input[10];
165
10.7M
  bf1[6] = input[6] + input[9];
166
10.7M
  bf1[7] = input[7] + input[8];
167
10.7M
  bf1[8] = -input[8] + input[7];
168
10.7M
  bf1[9] = -input[9] + input[6];
169
10.7M
  bf1[10] = -input[10] + input[5];
170
10.7M
  bf1[11] = -input[11] + input[4];
171
10.7M
  bf1[12] = -input[12] + input[3];
172
10.7M
  bf1[13] = -input[13] + input[2];
173
10.7M
  bf1[14] = -input[14] + input[1];
174
10.7M
  bf1[15] = -input[15] + input[0];
175
10.7M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
176
177
  // stage 2
178
10.7M
  stage++;
179
10.7M
  cospi = cospi_arr(cos_bit);
180
10.7M
  bf0 = output;
181
10.7M
  bf1 = step;
182
10.7M
  bf1[0] = bf0[0] + bf0[7];
183
10.7M
  bf1[1] = bf0[1] + bf0[6];
184
10.7M
  bf1[2] = bf0[2] + bf0[5];
185
10.7M
  bf1[3] = bf0[3] + bf0[4];
186
10.7M
  bf1[4] = -bf0[4] + bf0[3];
187
10.7M
  bf1[5] = -bf0[5] + bf0[2];
188
10.7M
  bf1[6] = -bf0[6] + bf0[1];
189
10.7M
  bf1[7] = -bf0[7] + bf0[0];
190
10.7M
  bf1[8] = bf0[8];
191
10.7M
  bf1[9] = bf0[9];
192
10.7M
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
193
10.7M
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
194
10.7M
  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
195
10.7M
  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
196
10.7M
  bf1[14] = bf0[14];
197
10.7M
  bf1[15] = bf0[15];
198
10.7M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
199
200
  // stage 3
201
10.7M
  stage++;
202
10.7M
  cospi = cospi_arr(cos_bit);
203
10.7M
  bf0 = step;
204
10.7M
  bf1 = output;
205
10.7M
  bf1[0] = bf0[0] + bf0[3];
206
10.7M
  bf1[1] = bf0[1] + bf0[2];
207
10.7M
  bf1[2] = -bf0[2] + bf0[1];
208
10.7M
  bf1[3] = -bf0[3] + bf0[0];
209
10.7M
  bf1[4] = bf0[4];
210
10.7M
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
211
10.7M
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
212
10.7M
  bf1[7] = bf0[7];
213
10.7M
  bf1[8] = bf0[8] + bf0[11];
214
10.7M
  bf1[9] = bf0[9] + bf0[10];
215
10.7M
  bf1[10] = -bf0[10] + bf0[9];
216
10.7M
  bf1[11] = -bf0[11] + bf0[8];
217
10.7M
  bf1[12] = -bf0[12] + bf0[15];
218
10.7M
  bf1[13] = -bf0[13] + bf0[14];
219
10.7M
  bf1[14] = bf0[14] + bf0[13];
220
10.7M
  bf1[15] = bf0[15] + bf0[12];
221
10.7M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
222
223
  // stage 4
224
10.7M
  stage++;
225
10.7M
  cospi = cospi_arr(cos_bit);
226
10.7M
  bf0 = output;
227
10.7M
  bf1 = step;
228
10.7M
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
229
10.7M
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
230
10.7M
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
231
10.7M
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
232
10.7M
  bf1[4] = bf0[4] + bf0[5];
233
10.7M
  bf1[5] = -bf0[5] + bf0[4];
234
10.7M
  bf1[6] = -bf0[6] + bf0[7];
235
10.7M
  bf1[7] = bf0[7] + bf0[6];
236
10.7M
  bf1[8] = bf0[8];
237
10.7M
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
238
10.7M
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
239
10.7M
  bf1[11] = bf0[11];
240
10.7M
  bf1[12] = bf0[12];
241
10.7M
  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
242
10.7M
  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
243
10.7M
  bf1[15] = bf0[15];
244
10.7M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
245
246
  // stage 5
247
10.7M
  stage++;
248
10.7M
  cospi = cospi_arr(cos_bit);
249
10.7M
  bf0 = step;
250
10.7M
  bf1 = output;
251
10.7M
  bf1[0] = bf0[0];
252
10.7M
  bf1[1] = bf0[1];
253
10.7M
  bf1[2] = bf0[2];
254
10.7M
  bf1[3] = bf0[3];
255
10.7M
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
256
10.7M
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
257
10.7M
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
258
10.7M
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
259
10.7M
  bf1[8] = bf0[8] + bf0[9];
260
10.7M
  bf1[9] = -bf0[9] + bf0[8];
261
10.7M
  bf1[10] = -bf0[10] + bf0[11];
262
10.7M
  bf1[11] = bf0[11] + bf0[10];
263
10.7M
  bf1[12] = bf0[12] + bf0[13];
264
10.7M
  bf1[13] = -bf0[13] + bf0[12];
265
10.7M
  bf1[14] = -bf0[14] + bf0[15];
266
10.7M
  bf1[15] = bf0[15] + bf0[14];
267
10.7M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
268
269
  // stage 6
270
10.7M
  stage++;
271
10.7M
  cospi = cospi_arr(cos_bit);
272
10.7M
  bf0 = output;
273
10.7M
  bf1 = step;
274
10.7M
  bf1[0] = bf0[0];
275
10.7M
  bf1[1] = bf0[1];
276
10.7M
  bf1[2] = bf0[2];
277
10.7M
  bf1[3] = bf0[3];
278
10.7M
  bf1[4] = bf0[4];
279
10.7M
  bf1[5] = bf0[5];
280
10.7M
  bf1[6] = bf0[6];
281
10.7M
  bf1[7] = bf0[7];
282
10.7M
  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
283
10.7M
  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
284
10.7M
  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
285
10.7M
  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
286
10.7M
  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
287
10.7M
  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
288
10.7M
  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
289
10.7M
  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
290
10.7M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
291
292
  // stage 7
293
10.7M
  stage++;
294
10.7M
  bf0 = step;
295
10.7M
  bf1 = output;
296
10.7M
  bf1[0] = bf0[0];
297
10.7M
  bf1[1] = bf0[8];
298
10.7M
  bf1[2] = bf0[4];
299
10.7M
  bf1[3] = bf0[12];
300
10.7M
  bf1[4] = bf0[2];
301
10.7M
  bf1[5] = bf0[10];
302
10.7M
  bf1[6] = bf0[6];
303
10.7M
  bf1[7] = bf0[14];
304
10.7M
  bf1[8] = bf0[1];
305
10.7M
  bf1[9] = bf0[9];
306
10.7M
  bf1[10] = bf0[5];
307
10.7M
  bf1[11] = bf0[13];
308
10.7M
  bf1[12] = bf0[3];
309
10.7M
  bf1[13] = bf0[11];
310
10.7M
  bf1[14] = bf0[7];
311
10.7M
  bf1[15] = bf0[15];
312
10.7M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
313
10.7M
}
314
315
void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
316
15.1M
                const int8_t *stage_range) {
317
15.1M
  const int32_t size = 32;
318
15.1M
  const int32_t *cospi;
319
320
15.1M
  int32_t stage = 0;
321
15.1M
  int32_t *bf0, *bf1;
322
15.1M
  int32_t step[32];
323
324
  // stage 0;
325
15.1M
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
326
327
  // stage 1;
328
15.1M
  stage++;
329
15.1M
  bf1 = output;
330
15.1M
  bf1[0] = input[0] + input[31];
331
15.1M
  bf1[1] = input[1] + input[30];
332
15.1M
  bf1[2] = input[2] + input[29];
333
15.1M
  bf1[3] = input[3] + input[28];
334
15.1M
  bf1[4] = input[4] + input[27];
335
15.1M
  bf1[5] = input[5] + input[26];
336
15.1M
  bf1[6] = input[6] + input[25];
337
15.1M
  bf1[7] = input[7] + input[24];
338
15.1M
  bf1[8] = input[8] + input[23];
339
15.1M
  bf1[9] = input[9] + input[22];
340
15.1M
  bf1[10] = input[10] + input[21];
341
15.1M
  bf1[11] = input[11] + input[20];
342
15.1M
  bf1[12] = input[12] + input[19];
343
15.1M
  bf1[13] = input[13] + input[18];
344
15.1M
  bf1[14] = input[14] + input[17];
345
15.1M
  bf1[15] = input[15] + input[16];
346
15.1M
  bf1[16] = -input[16] + input[15];
347
15.1M
  bf1[17] = -input[17] + input[14];
348
15.1M
  bf1[18] = -input[18] + input[13];
349
15.1M
  bf1[19] = -input[19] + input[12];
350
15.1M
  bf1[20] = -input[20] + input[11];
351
15.1M
  bf1[21] = -input[21] + input[10];
352
15.1M
  bf1[22] = -input[22] + input[9];
353
15.1M
  bf1[23] = -input[23] + input[8];
354
15.1M
  bf1[24] = -input[24] + input[7];
355
15.1M
  bf1[25] = -input[25] + input[6];
356
15.1M
  bf1[26] = -input[26] + input[5];
357
15.1M
  bf1[27] = -input[27] + input[4];
358
15.1M
  bf1[28] = -input[28] + input[3];
359
15.1M
  bf1[29] = -input[29] + input[2];
360
15.1M
  bf1[30] = -input[30] + input[1];
361
15.1M
  bf1[31] = -input[31] + input[0];
362
15.1M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
363
364
  // stage 2
365
15.1M
  stage++;
366
15.1M
  cospi = cospi_arr(cos_bit);
367
15.1M
  bf0 = output;
368
15.1M
  bf1 = step;
369
15.1M
  bf1[0] = bf0[0] + bf0[15];
370
15.1M
  bf1[1] = bf0[1] + bf0[14];
371
15.1M
  bf1[2] = bf0[2] + bf0[13];
372
15.1M
  bf1[3] = bf0[3] + bf0[12];
373
15.1M
  bf1[4] = bf0[4] + bf0[11];
374
15.1M
  bf1[5] = bf0[5] + bf0[10];
375
15.1M
  bf1[6] = bf0[6] + bf0[9];
376
15.1M
  bf1[7] = bf0[7] + bf0[8];
377
15.1M
  bf1[8] = -bf0[8] + bf0[7];
378
15.1M
  bf1[9] = -bf0[9] + bf0[6];
379
15.1M
  bf1[10] = -bf0[10] + bf0[5];
380
15.1M
  bf1[11] = -bf0[11] + bf0[4];
381
15.1M
  bf1[12] = -bf0[12] + bf0[3];
382
15.1M
  bf1[13] = -bf0[13] + bf0[2];
383
15.1M
  bf1[14] = -bf0[14] + bf0[1];
384
15.1M
  bf1[15] = -bf0[15] + bf0[0];
385
15.1M
  bf1[16] = bf0[16];
386
15.1M
  bf1[17] = bf0[17];
387
15.1M
  bf1[18] = bf0[18];
388
15.1M
  bf1[19] = bf0[19];
389
15.1M
  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
390
15.1M
  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
391
15.1M
  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
392
15.1M
  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
393
15.1M
  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
394
15.1M
  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
395
15.1M
  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
396
15.1M
  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
397
15.1M
  bf1[28] = bf0[28];
398
15.1M
  bf1[29] = bf0[29];
399
15.1M
  bf1[30] = bf0[30];
400
15.1M
  bf1[31] = bf0[31];
401
15.1M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
402
403
  // stage 3
404
15.1M
  stage++;
405
15.1M
  cospi = cospi_arr(cos_bit);
406
15.1M
  bf0 = step;
407
15.1M
  bf1 = output;
408
15.1M
  bf1[0] = bf0[0] + bf0[7];
409
15.1M
  bf1[1] = bf0[1] + bf0[6];
410
15.1M
  bf1[2] = bf0[2] + bf0[5];
411
15.1M
  bf1[3] = bf0[3] + bf0[4];
412
15.1M
  bf1[4] = -bf0[4] + bf0[3];
413
15.1M
  bf1[5] = -bf0[5] + bf0[2];
414
15.1M
  bf1[6] = -bf0[6] + bf0[1];
415
15.1M
  bf1[7] = -bf0[7] + bf0[0];
416
15.1M
  bf1[8] = bf0[8];
417
15.1M
  bf1[9] = bf0[9];
418
15.1M
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
419
15.1M
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
420
15.1M
  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
421
15.1M
  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
422
15.1M
  bf1[14] = bf0[14];
423
15.1M
  bf1[15] = bf0[15];
424
15.1M
  bf1[16] = bf0[16] + bf0[23];
425
15.1M
  bf1[17] = bf0[17] + bf0[22];
426
15.1M
  bf1[18] = bf0[18] + bf0[21];
427
15.1M
  bf1[19] = bf0[19] + bf0[20];
428
15.1M
  bf1[20] = -bf0[20] + bf0[19];
429
15.1M
  bf1[21] = -bf0[21] + bf0[18];
430
15.1M
  bf1[22] = -bf0[22] + bf0[17];
431
15.1M
  bf1[23] = -bf0[23] + bf0[16];
432
15.1M
  bf1[24] = -bf0[24] + bf0[31];
433
15.1M
  bf1[25] = -bf0[25] + bf0[30];
434
15.1M
  bf1[26] = -bf0[26] + bf0[29];
435
15.1M
  bf1[27] = -bf0[27] + bf0[28];
436
15.1M
  bf1[28] = bf0[28] + bf0[27];
437
15.1M
  bf1[29] = bf0[29] + bf0[26];
438
15.1M
  bf1[30] = bf0[30] + bf0[25];
439
15.1M
  bf1[31] = bf0[31] + bf0[24];
440
15.1M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
441
442
  // stage 4
443
15.1M
  stage++;
444
15.1M
  cospi = cospi_arr(cos_bit);
445
15.1M
  bf0 = output;
446
15.1M
  bf1 = step;
447
15.1M
  bf1[0] = bf0[0] + bf0[3];
448
15.1M
  bf1[1] = bf0[1] + bf0[2];
449
15.1M
  bf1[2] = -bf0[2] + bf0[1];
450
15.1M
  bf1[3] = -bf0[3] + bf0[0];
451
15.1M
  bf1[4] = bf0[4];
452
15.1M
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
453
15.1M
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
454
15.1M
  bf1[7] = bf0[7];
455
15.1M
  bf1[8] = bf0[8] + bf0[11];
456
15.1M
  bf1[9] = bf0[9] + bf0[10];
457
15.1M
  bf1[10] = -bf0[10] + bf0[9];
458
15.1M
  bf1[11] = -bf0[11] + bf0[8];
459
15.1M
  bf1[12] = -bf0[12] + bf0[15];
460
15.1M
  bf1[13] = -bf0[13] + bf0[14];
461
15.1M
  bf1[14] = bf0[14] + bf0[13];
462
15.1M
  bf1[15] = bf0[15] + bf0[12];
463
15.1M
  bf1[16] = bf0[16];
464
15.1M
  bf1[17] = bf0[17];
465
15.1M
  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
466
15.1M
  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
467
15.1M
  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
468
15.1M
  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
469
15.1M
  bf1[22] = bf0[22];
470
15.1M
  bf1[23] = bf0[23];
471
15.1M
  bf1[24] = bf0[24];
472
15.1M
  bf1[25] = bf0[25];
473
15.1M
  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
474
15.1M
  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
475
15.1M
  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
476
15.1M
  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
477
15.1M
  bf1[30] = bf0[30];
478
15.1M
  bf1[31] = bf0[31];
479
15.1M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
480
481
  // stage 5
482
15.1M
  stage++;
483
15.1M
  cospi = cospi_arr(cos_bit);
484
15.1M
  bf0 = step;
485
15.1M
  bf1 = output;
486
15.1M
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
487
15.1M
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
488
15.1M
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
489
15.1M
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
490
15.1M
  bf1[4] = bf0[4] + bf0[5];
491
15.1M
  bf1[5] = -bf0[5] + bf0[4];
492
15.1M
  bf1[6] = -bf0[6] + bf0[7];
493
15.1M
  bf1[7] = bf0[7] + bf0[6];
494
15.1M
  bf1[8] = bf0[8];
495
15.1M
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
496
15.1M
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
497
15.1M
  bf1[11] = bf0[11];
498
15.1M
  bf1[12] = bf0[12];
499
15.1M
  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
500
15.1M
  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
501
15.1M
  bf1[15] = bf0[15];
502
15.1M
  bf1[16] = bf0[16] + bf0[19];
503
15.1M
  bf1[17] = bf0[17] + bf0[18];
504
15.1M
  bf1[18] = -bf0[18] + bf0[17];
505
15.1M
  bf1[19] = -bf0[19] + bf0[16];
506
15.1M
  bf1[20] = -bf0[20] + bf0[23];
507
15.1M
  bf1[21] = -bf0[21] + bf0[22];
508
15.1M
  bf1[22] = bf0[22] + bf0[21];
509
15.1M
  bf1[23] = bf0[23] + bf0[20];
510
15.1M
  bf1[24] = bf0[24] + bf0[27];
511
15.1M
  bf1[25] = bf0[25] + bf0[26];
512
15.1M
  bf1[26] = -bf0[26] + bf0[25];
513
15.1M
  bf1[27] = -bf0[27] + bf0[24];
514
15.1M
  bf1[28] = -bf0[28] + bf0[31];
515
15.1M
  bf1[29] = -bf0[29] + bf0[30];
516
15.1M
  bf1[30] = bf0[30] + bf0[29];
517
15.1M
  bf1[31] = bf0[31] + bf0[28];
518
15.1M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
519
520
  // stage 6
521
15.1M
  stage++;
522
15.1M
  cospi = cospi_arr(cos_bit);
523
15.1M
  bf0 = output;
524
15.1M
  bf1 = step;
525
15.1M
  bf1[0] = bf0[0];
526
15.1M
  bf1[1] = bf0[1];
527
15.1M
  bf1[2] = bf0[2];
528
15.1M
  bf1[3] = bf0[3];
529
15.1M
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
530
15.1M
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
531
15.1M
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
532
15.1M
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
533
15.1M
  bf1[8] = bf0[8] + bf0[9];
534
15.1M
  bf1[9] = -bf0[9] + bf0[8];
535
15.1M
  bf1[10] = -bf0[10] + bf0[11];
536
15.1M
  bf1[11] = bf0[11] + bf0[10];
537
15.1M
  bf1[12] = bf0[12] + bf0[13];
538
15.1M
  bf1[13] = -bf0[13] + bf0[12];
539
15.1M
  bf1[14] = -bf0[14] + bf0[15];
540
15.1M
  bf1[15] = bf0[15] + bf0[14];
541
15.1M
  bf1[16] = bf0[16];
542
15.1M
  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
543
15.1M
  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
544
15.1M
  bf1[19] = bf0[19];
545
15.1M
  bf1[20] = bf0[20];
546
15.1M
  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
547
15.1M
  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
548
15.1M
  bf1[23] = bf0[23];
549
15.1M
  bf1[24] = bf0[24];
550
15.1M
  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
551
15.1M
  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
552
15.1M
  bf1[27] = bf0[27];
553
15.1M
  bf1[28] = bf0[28];
554
15.1M
  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
555
15.1M
  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
556
15.1M
  bf1[31] = bf0[31];
557
15.1M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
558
559
  // stage 7
560
15.1M
  stage++;
561
15.1M
  cospi = cospi_arr(cos_bit);
562
15.1M
  bf0 = step;
563
15.1M
  bf1 = output;
564
15.1M
  bf1[0] = bf0[0];
565
15.1M
  bf1[1] = bf0[1];
566
15.1M
  bf1[2] = bf0[2];
567
15.1M
  bf1[3] = bf0[3];
568
15.1M
  bf1[4] = bf0[4];
569
15.1M
  bf1[5] = bf0[5];
570
15.1M
  bf1[6] = bf0[6];
571
15.1M
  bf1[7] = bf0[7];
572
15.1M
  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
573
15.1M
  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
574
15.1M
  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
575
15.1M
  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
576
15.1M
  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
577
15.1M
  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
578
15.1M
  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
579
15.1M
  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
580
15.1M
  bf1[16] = bf0[16] + bf0[17];
581
15.1M
  bf1[17] = -bf0[17] + bf0[16];
582
15.1M
  bf1[18] = -bf0[18] + bf0[19];
583
15.1M
  bf1[19] = bf0[19] + bf0[18];
584
15.1M
  bf1[20] = bf0[20] + bf0[21];
585
15.1M
  bf1[21] = -bf0[21] + bf0[20];
586
15.1M
  bf1[22] = -bf0[22] + bf0[23];
587
15.1M
  bf1[23] = bf0[23] + bf0[22];
588
15.1M
  bf1[24] = bf0[24] + bf0[25];
589
15.1M
  bf1[25] = -bf0[25] + bf0[24];
590
15.1M
  bf1[26] = -bf0[26] + bf0[27];
591
15.1M
  bf1[27] = bf0[27] + bf0[26];
592
15.1M
  bf1[28] = bf0[28] + bf0[29];
593
15.1M
  bf1[29] = -bf0[29] + bf0[28];
594
15.1M
  bf1[30] = -bf0[30] + bf0[31];
595
15.1M
  bf1[31] = bf0[31] + bf0[30];
596
15.1M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
597
598
  // stage 8
599
15.1M
  stage++;
600
15.1M
  cospi = cospi_arr(cos_bit);
601
15.1M
  bf0 = output;
602
15.1M
  bf1 = step;
603
15.1M
  bf1[0] = bf0[0];
604
15.1M
  bf1[1] = bf0[1];
605
15.1M
  bf1[2] = bf0[2];
606
15.1M
  bf1[3] = bf0[3];
607
15.1M
  bf1[4] = bf0[4];
608
15.1M
  bf1[5] = bf0[5];
609
15.1M
  bf1[6] = bf0[6];
610
15.1M
  bf1[7] = bf0[7];
611
15.1M
  bf1[8] = bf0[8];
612
15.1M
  bf1[9] = bf0[9];
613
15.1M
  bf1[10] = bf0[10];
614
15.1M
  bf1[11] = bf0[11];
615
15.1M
  bf1[12] = bf0[12];
616
15.1M
  bf1[13] = bf0[13];
617
15.1M
  bf1[14] = bf0[14];
618
15.1M
  bf1[15] = bf0[15];
619
15.1M
  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
620
15.1M
  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
621
15.1M
  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
622
15.1M
  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
623
15.1M
  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
624
15.1M
  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
625
15.1M
  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
626
15.1M
  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
627
15.1M
  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
628
15.1M
  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
629
15.1M
  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
630
15.1M
  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
631
15.1M
  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
632
15.1M
  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
633
15.1M
  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
634
15.1M
  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
635
15.1M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
636
637
  // stage 9
638
15.1M
  stage++;
639
15.1M
  bf0 = step;
640
15.1M
  bf1 = output;
641
15.1M
  bf1[0] = bf0[0];
642
15.1M
  bf1[1] = bf0[16];
643
15.1M
  bf1[2] = bf0[8];
644
15.1M
  bf1[3] = bf0[24];
645
15.1M
  bf1[4] = bf0[4];
646
15.1M
  bf1[5] = bf0[20];
647
15.1M
  bf1[6] = bf0[12];
648
15.1M
  bf1[7] = bf0[28];
649
15.1M
  bf1[8] = bf0[2];
650
15.1M
  bf1[9] = bf0[18];
651
15.1M
  bf1[10] = bf0[10];
652
15.1M
  bf1[11] = bf0[26];
653
15.1M
  bf1[12] = bf0[6];
654
15.1M
  bf1[13] = bf0[22];
655
15.1M
  bf1[14] = bf0[14];
656
15.1M
  bf1[15] = bf0[30];
657
15.1M
  bf1[16] = bf0[1];
658
15.1M
  bf1[17] = bf0[17];
659
15.1M
  bf1[18] = bf0[9];
660
15.1M
  bf1[19] = bf0[25];
661
15.1M
  bf1[20] = bf0[5];
662
15.1M
  bf1[21] = bf0[21];
663
15.1M
  bf1[22] = bf0[13];
664
15.1M
  bf1[23] = bf0[29];
665
15.1M
  bf1[24] = bf0[3];
666
15.1M
  bf1[25] = bf0[19];
667
15.1M
  bf1[26] = bf0[11];
668
15.1M
  bf1[27] = bf0[27];
669
15.1M
  bf1[28] = bf0[7];
670
15.1M
  bf1[29] = bf0[23];
671
15.1M
  bf1[30] = bf0[15];
672
15.1M
  bf1[31] = bf0[31];
673
15.1M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
674
15.1M
}
675
676
void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
677
181k
                const int8_t *stage_range) {
678
181k
  int bit = cos_bit;
679
181k
  const int32_t *sinpi = sinpi_arr(bit);
680
181k
  int32_t x0, x1, x2, x3;
681
181k
  int32_t s0, s1, s2, s3, s4, s5, s6, s7;
682
683
  // stage 0
684
181k
  av1_range_check_buf(0, input, input, 4, stage_range[0]);
685
181k
  x0 = input[0];
686
181k
  x1 = input[1];
687
181k
  x2 = input[2];
688
181k
  x3 = input[3];
689
690
181k
  if (!(x0 | x1 | x2 | x3)) {
691
97.1k
    output[0] = output[1] = output[2] = output[3] = 0;
692
97.1k
    return;
693
97.1k
  }
694
695
  // stage 1
696
84.4k
  s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]);
697
84.4k
  s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]);
698
84.4k
  s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]);
699
84.4k
  s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]);
700
84.4k
  s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]);
701
84.4k
  s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]);
702
84.4k
  s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]);
703
84.4k
  s7 = range_check_value(x0 + x1, stage_range[1]);
704
705
  // stage 2
706
84.4k
  s7 = range_check_value(s7 - x3, stage_range[2]);
707
708
  // stage 3
709
84.4k
  x0 = range_check_value(s0 + s2, bit + stage_range[3]);
710
84.4k
  x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]);
711
84.4k
  x2 = range_check_value(s1 - s3, bit + stage_range[3]);
712
84.4k
  x3 = range_check_value(s4, bit + stage_range[3]);
713
714
  // stage 4
715
84.4k
  x0 = range_check_value(x0 + s5, bit + stage_range[4]);
716
84.4k
  x2 = range_check_value(x2 + s6, bit + stage_range[4]);
717
718
  // stage 5
719
84.4k
  s0 = range_check_value(x0 + x3, bit + stage_range[5]);
720
84.4k
  s1 = range_check_value(x1, bit + stage_range[5]);
721
84.4k
  s2 = range_check_value(x2 - x3, bit + stage_range[5]);
722
84.4k
  s3 = range_check_value(x2 - x0, bit + stage_range[5]);
723
724
  // stage 6
725
84.4k
  s3 = range_check_value(s3 + x3, bit + stage_range[6]);
726
727
  // 1-D transform scaling factor is sqrt(2).
728
84.4k
  output[0] = round_shift(s0, bit);
729
84.4k
  output[1] = round_shift(s1, bit);
730
84.4k
  output[2] = round_shift(s2, bit);
731
84.4k
  output[3] = round_shift(s3, bit);
732
84.4k
  av1_range_check_buf(6, input, output, 4, stage_range[6]);
733
84.4k
}
734
735
// 8-point forward ADST, evaluated as a 7-stage butterfly network.
//
// input       : 8 source coefficients (read-only).
// output      : 8 result coefficients; must not alias input (asserted).
// cos_bit     : selects the cosine table via cospi_arr() and is passed to
//               every half_btf() call.
// stage_range : one entry per stage (0..7), handed to av1_range_check_buf()
//               after the corresponding stage.
//
// Stages ping-pong between output and a local scratch buffer, so a stage
// never reads from the buffer it is writing to.
void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  enum { kSize = 8 };
  // Stage 1: output[i] = +/- input[kInOrder[i]], negated where kInNegate[i].
  static const int8_t kInOrder[kSize] = { 0, 7, 3, 4, 1, 6, 2, 5 };
  static const int8_t kInNegate[kSize] = { 0, 1, 1, 0, 1, 0, 0, 1 };
  // Stage 7: output[i] = scratch[kOutOrder[i]].
  static const int8_t kOutOrder[kSize] = { 1, 6, 3, 4, 5, 2, 7, 0 };

  const int32_t *cospi;
  int32_t scratch[kSize];
  int32_t stage = 0;
  int i;

  // stage 0: check the raw input.
  av1_range_check_buf(stage, input, input, kSize, stage_range[stage]);

  // stage 1: signed input permutation, input -> output.
  stage++;
  assert(output != input);
  for (i = 0; i < kSize; ++i) {
    const int32_t v = input[kInOrder[i]];
    output[i] = kInNegate[i] ? -v : v;
  }
  av1_range_check_buf(stage, input, output, kSize, stage_range[stage]);

  // stage 2: per group of four, pass two values through and rotate the other
  // pair by cospi[32]; output -> scratch.
  stage++;
  cospi = cospi_arr(cos_bit);
  for (i = 0; i < kSize; i += 4) {
    scratch[i] = output[i];
    scratch[i + 1] = output[i + 1];
    scratch[i + 2] =
        half_btf(cospi[32], output[i + 2], cospi[32], output[i + 3], cos_bit);
    scratch[i + 3] =
        half_btf(cospi[32], output[i + 2], -cospi[32], output[i + 3], cos_bit);
  }
  av1_range_check_buf(stage, input, scratch, kSize, stage_range[stage]);

  // stage 3: sum/difference at distance 2 within each group of four;
  // scratch -> output.
  stage++;
  for (i = 0; i < kSize; i += 4) {
    output[i] = scratch[i] + scratch[i + 2];
    output[i + 1] = scratch[i + 1] + scratch[i + 3];
    output[i + 2] = scratch[i] - scratch[i + 2];
    output[i + 3] = scratch[i + 1] - scratch[i + 3];
  }
  av1_range_check_buf(stage, input, output, kSize, stage_range[stage]);

  // stage 4: lower half passes through, upper half is rotated with
  // cospi[16]/cospi[48]; output -> scratch.
  stage++;
  for (i = 0; i < 4; ++i) scratch[i] = output[i];
  scratch[4] = half_btf(cospi[16], output[4], cospi[48], output[5], cos_bit);
  scratch[5] = half_btf(cospi[48], output[4], -cospi[16], output[5], cos_bit);
  scratch[6] = half_btf(-cospi[48], output[6], cospi[16], output[7], cos_bit);
  scratch[7] = half_btf(cospi[16], output[6], cospi[48], output[7], cos_bit);
  av1_range_check_buf(stage, input, scratch, kSize, stage_range[stage]);

  // stage 5: sum/difference at distance 4; scratch -> output.
  stage++;
  for (i = 0; i < 4; ++i) {
    output[i] = scratch[i] + scratch[i + 4];
    output[i + 4] = scratch[i] - scratch[i + 4];
  }
  av1_range_check_buf(stage, input, output, kSize, stage_range[stage]);

  // stage 6: rotate each adjacent pair; the weight indices are
  // (4,60), (20,44), (36,28), (52,12); output -> scratch.
  stage++;
  for (i = 0; i < kSize; i += 2) {
    const int32_t wa = cospi[4 + 8 * i];
    const int32_t wb = cospi[60 - 8 * i];
    scratch[i] = half_btf(wa, output[i], wb, output[i + 1], cos_bit);
    scratch[i + 1] = half_btf(wb, output[i], -wa, output[i + 1], cos_bit);
  }
  av1_range_check_buf(stage, input, scratch, kSize, stage_range[stage]);

  // stage 7: final output permutation; scratch -> output.
  stage++;
  for (i = 0; i < kSize; ++i) output[i] = scratch[kOutOrder[i]];
  av1_range_check_buf(stage, input, output, kSize, stage_range[stage]);
}
848
849
// 16-point forward ADST, evaluated as a 9-stage butterfly network.
//
// input       : 16 source coefficients (read-only).
// output      : 16 result coefficients; must not alias input (asserted).
// cos_bit     : selects the cosine table via cospi_arr() and is passed to
//               every half_btf() call.
// stage_range : one entry per stage (0..9), handed to av1_range_check_buf()
//               after the corresponding stage.
//
// Stages ping-pong between output and a local scratch buffer, so a stage
// never reads from the buffer it is writing to.
void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                 const int8_t *stage_range) {
  enum { kSize = 16 };
  // Stage 1: output[i] = +/- input[kInOrder[i]], negated where kInNegate[i].
  static const int8_t kInOrder[kSize] = { 0, 15, 7, 8,  3, 12, 4, 11,
                                          1, 14, 6, 9,  2, 13, 5, 10 };
  static const int8_t kInNegate[kSize] = { 0, 1, 1, 0, 1, 0, 0, 1,
                                           1, 0, 0, 1, 0, 1, 1, 0 };
  // Stage 9: output[i] = scratch[kOutOrder[i]].
  static const int8_t kOutOrder[kSize] = { 1, 14, 3,  12, 5,  10, 7,  8,
                                           9, 6,  11, 4,  13, 2,  15, 0 };

  const int32_t *cospi;
  int32_t scratch[kSize];
  int32_t stage = 0;
  int i, j;

  // stage 0: check the raw input.
  av1_range_check_buf(stage, input, input, kSize, stage_range[stage]);

  // stage 1: signed input permutation, input -> output.
  stage++;
  assert(output != input);
  for (i = 0; i < kSize; ++i) {
    const int32_t v = input[kInOrder[i]];
    output[i] = kInNegate[i] ? -v : v;
  }
  av1_range_check_buf(stage, input, output, kSize, stage_range[stage]);

  // stage 2: per group of four, pass two values through and rotate the other
  // pair by cospi[32]; output -> scratch.
  stage++;
  cospi = cospi_arr(cos_bit);
  for (i = 0; i < kSize; i += 4) {
    scratch[i] = output[i];
    scratch[i + 1] = output[i + 1];
    scratch[i + 2] =
        half_btf(cospi[32], output[i + 2], cospi[32], output[i + 3], cos_bit);
    scratch[i + 3] =
        half_btf(cospi[32], output[i + 2], -cospi[32], output[i + 3], cos_bit);
  }
  av1_range_check_buf(stage, input, scratch, kSize, stage_range[stage]);

  // stage 3: sum/difference at distance 2 within each group of four;
  // scratch -> output.
  stage++;
  for (i = 0; i < kSize; i += 4) {
    output[i] = scratch[i] + scratch[i + 2];
    output[i + 1] = scratch[i + 1] + scratch[i + 3];
    output[i + 2] = scratch[i] - scratch[i + 2];
    output[i + 3] = scratch[i + 1] - scratch[i + 3];
  }
  av1_range_check_buf(stage, input, output, kSize, stage_range[stage]);

  // stage 4: per group of eight, the first four pass through and the last
  // four are rotated with cospi[16]/cospi[48]; output -> scratch.
  stage++;
  for (i = 0; i < kSize; i += 8) {
    for (j = 0; j < 4; ++j) scratch[i + j] = output[i + j];
    scratch[i + 4] =
        half_btf(cospi[16], output[i + 4], cospi[48], output[i + 5], cos_bit);
    scratch[i + 5] =
        half_btf(cospi[48], output[i + 4], -cospi[16], output[i + 5], cos_bit);
    scratch[i + 6] =
        half_btf(-cospi[48], output[i + 6], cospi[16], output[i + 7], cos_bit);
    scratch[i + 7] =
        half_btf(cospi[16], output[i + 6], cospi[48], output[i + 7], cos_bit);
  }
  av1_range_check_buf(stage, input, scratch, kSize, stage_range[stage]);

  // stage 5: sum/difference at distance 4 within each group of eight;
  // scratch -> output.
  stage++;
  for (i = 0; i < kSize; i += 8) {
    for (j = 0; j < 4; ++j) {
      output[i + j] = scratch[i + j] + scratch[i + j + 4];
      output[i + j + 4] = scratch[i + j] - scratch[i + j + 4];
    }
  }
  av1_range_check_buf(stage, input, output, kSize, stage_range[stage]);

  // stage 6: lower half passes through, upper half is rotated with
  // cospi[8]/cospi[56] and cospi[40]/cospi[24]; output -> scratch.
  stage++;
  for (i = 0; i < 8; ++i) scratch[i] = output[i];
  scratch[8] = half_btf(cospi[8], output[8], cospi[56], output[9], cos_bit);
  scratch[9] = half_btf(cospi[56], output[8], -cospi[8], output[9], cos_bit);
  scratch[10] = half_btf(cospi[40], output[10], cospi[24], output[11], cos_bit);
  scratch[11] =
      half_btf(cospi[24], output[10], -cospi[40], output[11], cos_bit);
  scratch[12] = half_btf(-cospi[56], output[12], cospi[8], output[13], cos_bit);
  scratch[13] = half_btf(cospi[8], output[12], cospi[56], output[13], cos_bit);
  scratch[14] =
      half_btf(-cospi[24], output[14], cospi[40], output[15], cos_bit);
  scratch[15] = half_btf(cospi[40], output[14], cospi[24], output[15], cos_bit);
  av1_range_check_buf(stage, input, scratch, kSize, stage_range[stage]);

  // stage 7: sum/difference at distance 8; scratch -> output.
  stage++;
  for (i = 0; i < 8; ++i) {
    output[i] = scratch[i] + scratch[i + 8];
    output[i + 8] = scratch[i] - scratch[i + 8];
  }
  av1_range_check_buf(stage, input, output, kSize, stage_range[stage]);

  // stage 8: rotate each adjacent pair; the weight indices run
  // (2,62), (10,54), (18,46), ... , (58,6); output -> scratch.
  stage++;
  for (i = 0; i < kSize; i += 2) {
    const int32_t wa = cospi[2 + 4 * i];
    const int32_t wb = cospi[62 - 4 * i];
    scratch[i] = half_btf(wa, output[i], wb, output[i + 1], cos_bit);
    scratch[i + 1] = half_btf(wb, output[i], -wa, output[i + 1], cos_bit);
  }
  av1_range_check_buf(stage, input, scratch, kSize, stage_range[stage]);

  // stage 9: final output permutation; scratch -> output.
  stage++;
  for (i = 0; i < kSize; ++i) output[i] = scratch[kOutOrder[i]];
  av1_range_check_buf(stage, input, output, kSize, stage_range[stage]);
}
1063
1064
void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1065
0
                      const int8_t *stage_range) {
1066
0
  (void)cos_bit;
1067
0
  for (int i = 0; i < 4; ++i)
1068
0
    output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
1069
0
  assert(stage_range[0] + NewSqrt2Bits <= 32);
1070
0
  av1_range_check_buf(0, input, output, 4, stage_range[0]);
1071
0
}
1072
1073
// 8-point forward identity transform: every output is twice the matching
// input. cos_bit is unused; stage_range[0] is passed to the final range
// check.
void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  enum { kLen = 8 };
  int idx;

  (void)cos_bit;
  for (idx = 0; idx < kLen; ++idx) {
    const int32_t coeff = input[idx];
    output[idx] = coeff * 2;
  }
  av1_range_check_buf(0, input, output, kLen, stage_range[0]);
}
1079
1080
void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1081
0
                       const int8_t *stage_range) {
1082
0
  (void)cos_bit;
1083
0
  for (int i = 0; i < 16; ++i)
1084
0
    output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
1085
0
  assert(stage_range[0] + NewSqrt2Bits <= 32);
1086
0
  av1_range_check_buf(0, input, output, 16, stage_range[0]);
1087
0
}
1088
1089
// 32-point forward identity transform: every output is four times the
// matching input. cos_bit is unused; stage_range[0] is passed to the final
// range check.
void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
  enum { kLen = 32 };
  int idx;

  (void)cos_bit;
  for (idx = 0; idx < kLen; ++idx) {
    const int32_t coeff = input[idx];
    output[idx] = coeff * 4;
  }
  av1_range_check_buf(0, input, output, kLen, stage_range[0]);
}
1095
1096
void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
1097
0
                const int8_t *stage_range) {
1098
0
  const int32_t size = 64;
1099
0
  const int32_t *cospi;
1100
1101
0
  int32_t stage = 0;
1102
0
  int32_t *bf0, *bf1;
1103
0
  int32_t step[64];
1104
1105
  // stage 0;
1106
0
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
1107
1108
  // stage 1;
1109
0
  stage++;
1110
0
  bf1 = output;
1111
0
  bf1[0] = input[0] + input[63];
1112
0
  bf1[1] = input[1] + input[62];
1113
0
  bf1[2] = input[2] + input[61];
1114
0
  bf1[3] = input[3] + input[60];
1115
0
  bf1[4] = input[4] + input[59];
1116
0
  bf1[5] = input[5] + input[58];
1117
0
  bf1[6] = input[6] + input[57];
1118
0
  bf1[7] = input[7] + input[56];
1119
0
  bf1[8] = input[8] + input[55];
1120
0
  bf1[9] = input[9] + input[54];
1121
0
  bf1[10] = input[10] + input[53];
1122
0
  bf1[11] = input[11] + input[52];
1123
0
  bf1[12] = input[12] + input[51];
1124
0
  bf1[13] = input[13] + input[50];
1125
0
  bf1[14] = input[14] + input[49];
1126
0
  bf1[15] = input[15] + input[48];
1127
0
  bf1[16] = input[16] + input[47];
1128
0
  bf1[17] = input[17] + input[46];
1129
0
  bf1[18] = input[18] + input[45];
1130
0
  bf1[19] = input[19] + input[44];
1131
0
  bf1[20] = input[20] + input[43];
1132
0
  bf1[21] = input[21] + input[42];
1133
0
  bf1[22] = input[22] + input[41];
1134
0
  bf1[23] = input[23] + input[40];
1135
0
  bf1[24] = input[24] + input[39];
1136
0
  bf1[25] = input[25] + input[38];
1137
0
  bf1[26] = input[26] + input[37];
1138
0
  bf1[27] = input[27] + input[36];
1139
0
  bf1[28] = input[28] + input[35];
1140
0
  bf1[29] = input[29] + input[34];
1141
0
  bf1[30] = input[30] + input[33];
1142
0
  bf1[31] = input[31] + input[32];
1143
0
  bf1[32] = -input[32] + input[31];
1144
0
  bf1[33] = -input[33] + input[30];
1145
0
  bf1[34] = -input[34] + input[29];
1146
0
  bf1[35] = -input[35] + input[28];
1147
0
  bf1[36] = -input[36] + input[27];
1148
0
  bf1[37] = -input[37] + input[26];
1149
0
  bf1[38] = -input[38] + input[25];
1150
0
  bf1[39] = -input[39] + input[24];
1151
0
  bf1[40] = -input[40] + input[23];
1152
0
  bf1[41] = -input[41] + input[22];
1153
0
  bf1[42] = -input[42] + input[21];
1154
0
  bf1[43] = -input[43] + input[20];
1155
0
  bf1[44] = -input[44] + input[19];
1156
0
  bf1[45] = -input[45] + input[18];
1157
0
  bf1[46] = -input[46] + input[17];
1158
0
  bf1[47] = -input[47] + input[16];
1159
0
  bf1[48] = -input[48] + input[15];
1160
0
  bf1[49] = -input[49] + input[14];
1161
0
  bf1[50] = -input[50] + input[13];
1162
0
  bf1[51] = -input[51] + input[12];
1163
0
  bf1[52] = -input[52] + input[11];
1164
0
  bf1[53] = -input[53] + input[10];
1165
0
  bf1[54] = -input[54] + input[9];
1166
0
  bf1[55] = -input[55] + input[8];
1167
0
  bf1[56] = -input[56] + input[7];
1168
0
  bf1[57] = -input[57] + input[6];
1169
0
  bf1[58] = -input[58] + input[5];
1170
0
  bf1[59] = -input[59] + input[4];
1171
0
  bf1[60] = -input[60] + input[3];
1172
0
  bf1[61] = -input[61] + input[2];
1173
0
  bf1[62] = -input[62] + input[1];
1174
0
  bf1[63] = -input[63] + input[0];
1175
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1176
1177
  // stage 2
1178
0
  stage++;
1179
0
  cospi = cospi_arr(cos_bit);
1180
0
  bf0 = output;
1181
0
  bf1 = step;
1182
0
  bf1[0] = bf0[0] + bf0[31];
1183
0
  bf1[1] = bf0[1] + bf0[30];
1184
0
  bf1[2] = bf0[2] + bf0[29];
1185
0
  bf1[3] = bf0[3] + bf0[28];
1186
0
  bf1[4] = bf0[4] + bf0[27];
1187
0
  bf1[5] = bf0[5] + bf0[26];
1188
0
  bf1[6] = bf0[6] + bf0[25];
1189
0
  bf1[7] = bf0[7] + bf0[24];
1190
0
  bf1[8] = bf0[8] + bf0[23];
1191
0
  bf1[9] = bf0[9] + bf0[22];
1192
0
  bf1[10] = bf0[10] + bf0[21];
1193
0
  bf1[11] = bf0[11] + bf0[20];
1194
0
  bf1[12] = bf0[12] + bf0[19];
1195
0
  bf1[13] = bf0[13] + bf0[18];
1196
0
  bf1[14] = bf0[14] + bf0[17];
1197
0
  bf1[15] = bf0[15] + bf0[16];
1198
0
  bf1[16] = -bf0[16] + bf0[15];
1199
0
  bf1[17] = -bf0[17] + bf0[14];
1200
0
  bf1[18] = -bf0[18] + bf0[13];
1201
0
  bf1[19] = -bf0[19] + bf0[12];
1202
0
  bf1[20] = -bf0[20] + bf0[11];
1203
0
  bf1[21] = -bf0[21] + bf0[10];
1204
0
  bf1[22] = -bf0[22] + bf0[9];
1205
0
  bf1[23] = -bf0[23] + bf0[8];
1206
0
  bf1[24] = -bf0[24] + bf0[7];
1207
0
  bf1[25] = -bf0[25] + bf0[6];
1208
0
  bf1[26] = -bf0[26] + bf0[5];
1209
0
  bf1[27] = -bf0[27] + bf0[4];
1210
0
  bf1[28] = -bf0[28] + bf0[3];
1211
0
  bf1[29] = -bf0[29] + bf0[2];
1212
0
  bf1[30] = -bf0[30] + bf0[1];
1213
0
  bf1[31] = -bf0[31] + bf0[0];
1214
0
  bf1[32] = bf0[32];
1215
0
  bf1[33] = bf0[33];
1216
0
  bf1[34] = bf0[34];
1217
0
  bf1[35] = bf0[35];
1218
0
  bf1[36] = bf0[36];
1219
0
  bf1[37] = bf0[37];
1220
0
  bf1[38] = bf0[38];
1221
0
  bf1[39] = bf0[39];
1222
0
  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1223
0
  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1224
0
  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1225
0
  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1226
0
  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1227
0
  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1228
0
  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1229
0
  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1230
0
  bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
1231
0
  bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
1232
0
  bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
1233
0
  bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
1234
0
  bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
1235
0
  bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
1236
0
  bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
1237
0
  bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
1238
0
  bf1[56] = bf0[56];
1239
0
  bf1[57] = bf0[57];
1240
0
  bf1[58] = bf0[58];
1241
0
  bf1[59] = bf0[59];
1242
0
  bf1[60] = bf0[60];
1243
0
  bf1[61] = bf0[61];
1244
0
  bf1[62] = bf0[62];
1245
0
  bf1[63] = bf0[63];
1246
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1247
1248
  // stage 3
1249
0
  stage++;
1250
0
  cospi = cospi_arr(cos_bit);
1251
0
  bf0 = step;
1252
0
  bf1 = output;
1253
0
  bf1[0] = bf0[0] + bf0[15];
1254
0
  bf1[1] = bf0[1] + bf0[14];
1255
0
  bf1[2] = bf0[2] + bf0[13];
1256
0
  bf1[3] = bf0[3] + bf0[12];
1257
0
  bf1[4] = bf0[4] + bf0[11];
1258
0
  bf1[5] = bf0[5] + bf0[10];
1259
0
  bf1[6] = bf0[6] + bf0[9];
1260
0
  bf1[7] = bf0[7] + bf0[8];
1261
0
  bf1[8] = -bf0[8] + bf0[7];
1262
0
  bf1[9] = -bf0[9] + bf0[6];
1263
0
  bf1[10] = -bf0[10] + bf0[5];
1264
0
  bf1[11] = -bf0[11] + bf0[4];
1265
0
  bf1[12] = -bf0[12] + bf0[3];
1266
0
  bf1[13] = -bf0[13] + bf0[2];
1267
0
  bf1[14] = -bf0[14] + bf0[1];
1268
0
  bf1[15] = -bf0[15] + bf0[0];
1269
0
  bf1[16] = bf0[16];
1270
0
  bf1[17] = bf0[17];
1271
0
  bf1[18] = bf0[18];
1272
0
  bf1[19] = bf0[19];
1273
0
  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1274
0
  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1275
0
  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1276
0
  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1277
0
  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
1278
0
  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
1279
0
  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
1280
0
  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
1281
0
  bf1[28] = bf0[28];
1282
0
  bf1[29] = bf0[29];
1283
0
  bf1[30] = bf0[30];
1284
0
  bf1[31] = bf0[31];
1285
0
  bf1[32] = bf0[32] + bf0[47];
1286
0
  bf1[33] = bf0[33] + bf0[46];
1287
0
  bf1[34] = bf0[34] + bf0[45];
1288
0
  bf1[35] = bf0[35] + bf0[44];
1289
0
  bf1[36] = bf0[36] + bf0[43];
1290
0
  bf1[37] = bf0[37] + bf0[42];
1291
0
  bf1[38] = bf0[38] + bf0[41];
1292
0
  bf1[39] = bf0[39] + bf0[40];
1293
0
  bf1[40] = -bf0[40] + bf0[39];
1294
0
  bf1[41] = -bf0[41] + bf0[38];
1295
0
  bf1[42] = -bf0[42] + bf0[37];
1296
0
  bf1[43] = -bf0[43] + bf0[36];
1297
0
  bf1[44] = -bf0[44] + bf0[35];
1298
0
  bf1[45] = -bf0[45] + bf0[34];
1299
0
  bf1[46] = -bf0[46] + bf0[33];
1300
0
  bf1[47] = -bf0[47] + bf0[32];
1301
0
  bf1[48] = -bf0[48] + bf0[63];
1302
0
  bf1[49] = -bf0[49] + bf0[62];
1303
0
  bf1[50] = -bf0[50] + bf0[61];
1304
0
  bf1[51] = -bf0[51] + bf0[60];
1305
0
  bf1[52] = -bf0[52] + bf0[59];
1306
0
  bf1[53] = -bf0[53] + bf0[58];
1307
0
  bf1[54] = -bf0[54] + bf0[57];
1308
0
  bf1[55] = -bf0[55] + bf0[56];
1309
0
  bf1[56] = bf0[56] + bf0[55];
1310
0
  bf1[57] = bf0[57] + bf0[54];
1311
0
  bf1[58] = bf0[58] + bf0[53];
1312
0
  bf1[59] = bf0[59] + bf0[52];
1313
0
  bf1[60] = bf0[60] + bf0[51];
1314
0
  bf1[61] = bf0[61] + bf0[50];
1315
0
  bf1[62] = bf0[62] + bf0[49];
1316
0
  bf1[63] = bf0[63] + bf0[48];
1317
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1318
1319
  // stage 4
1320
0
  stage++;
1321
0
  cospi = cospi_arr(cos_bit);
1322
0
  bf0 = output;
1323
0
  bf1 = step;
1324
0
  bf1[0] = bf0[0] + bf0[7];
1325
0
  bf1[1] = bf0[1] + bf0[6];
1326
0
  bf1[2] = bf0[2] + bf0[5];
1327
0
  bf1[3] = bf0[3] + bf0[4];
1328
0
  bf1[4] = -bf0[4] + bf0[3];
1329
0
  bf1[5] = -bf0[5] + bf0[2];
1330
0
  bf1[6] = -bf0[6] + bf0[1];
1331
0
  bf1[7] = -bf0[7] + bf0[0];
1332
0
  bf1[8] = bf0[8];
1333
0
  bf1[9] = bf0[9];
1334
0
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1335
0
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1336
0
  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
1337
0
  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
1338
0
  bf1[14] = bf0[14];
1339
0
  bf1[15] = bf0[15];
1340
0
  bf1[16] = bf0[16] + bf0[23];
1341
0
  bf1[17] = bf0[17] + bf0[22];
1342
0
  bf1[18] = bf0[18] + bf0[21];
1343
0
  bf1[19] = bf0[19] + bf0[20];
1344
0
  bf1[20] = -bf0[20] + bf0[19];
1345
0
  bf1[21] = -bf0[21] + bf0[18];
1346
0
  bf1[22] = -bf0[22] + bf0[17];
1347
0
  bf1[23] = -bf0[23] + bf0[16];
1348
0
  bf1[24] = -bf0[24] + bf0[31];
1349
0
  bf1[25] = -bf0[25] + bf0[30];
1350
0
  bf1[26] = -bf0[26] + bf0[29];
1351
0
  bf1[27] = -bf0[27] + bf0[28];
1352
0
  bf1[28] = bf0[28] + bf0[27];
1353
0
  bf1[29] = bf0[29] + bf0[26];
1354
0
  bf1[30] = bf0[30] + bf0[25];
1355
0
  bf1[31] = bf0[31] + bf0[24];
1356
0
  bf1[32] = bf0[32];
1357
0
  bf1[33] = bf0[33];
1358
0
  bf1[34] = bf0[34];
1359
0
  bf1[35] = bf0[35];
1360
0
  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
1361
0
  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
1362
0
  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
1363
0
  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
1364
0
  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
1365
0
  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
1366
0
  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
1367
0
  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
1368
0
  bf1[44] = bf0[44];
1369
0
  bf1[45] = bf0[45];
1370
0
  bf1[46] = bf0[46];
1371
0
  bf1[47] = bf0[47];
1372
0
  bf1[48] = bf0[48];
1373
0
  bf1[49] = bf0[49];
1374
0
  bf1[50] = bf0[50];
1375
0
  bf1[51] = bf0[51];
1376
0
  bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
1377
0
  bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
1378
0
  bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
1379
0
  bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
1380
0
  bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
1381
0
  bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
1382
0
  bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
1383
0
  bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
1384
0
  bf1[60] = bf0[60];
1385
0
  bf1[61] = bf0[61];
1386
0
  bf1[62] = bf0[62];
1387
0
  bf1[63] = bf0[63];
1388
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1389
1390
  // stage 5
1391
0
  stage++;
1392
0
  cospi = cospi_arr(cos_bit);
1393
0
  bf0 = step;
1394
0
  bf1 = output;
1395
0
  bf1[0] = bf0[0] + bf0[3];
1396
0
  bf1[1] = bf0[1] + bf0[2];
1397
0
  bf1[2] = -bf0[2] + bf0[1];
1398
0
  bf1[3] = -bf0[3] + bf0[0];
1399
0
  bf1[4] = bf0[4];
1400
0
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1401
0
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
1402
0
  bf1[7] = bf0[7];
1403
0
  bf1[8] = bf0[8] + bf0[11];
1404
0
  bf1[9] = bf0[9] + bf0[10];
1405
0
  bf1[10] = -bf0[10] + bf0[9];
1406
0
  bf1[11] = -bf0[11] + bf0[8];
1407
0
  bf1[12] = -bf0[12] + bf0[15];
1408
0
  bf1[13] = -bf0[13] + bf0[14];
1409
0
  bf1[14] = bf0[14] + bf0[13];
1410
0
  bf1[15] = bf0[15] + bf0[12];
1411
0
  bf1[16] = bf0[16];
1412
0
  bf1[17] = bf0[17];
1413
0
  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
1414
0
  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
1415
0
  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
1416
0
  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
1417
0
  bf1[22] = bf0[22];
1418
0
  bf1[23] = bf0[23];
1419
0
  bf1[24] = bf0[24];
1420
0
  bf1[25] = bf0[25];
1421
0
  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
1422
0
  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
1423
0
  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
1424
0
  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
1425
0
  bf1[30] = bf0[30];
1426
0
  bf1[31] = bf0[31];
1427
0
  bf1[32] = bf0[32] + bf0[39];
1428
0
  bf1[33] = bf0[33] + bf0[38];
1429
0
  bf1[34] = bf0[34] + bf0[37];
1430
0
  bf1[35] = bf0[35] + bf0[36];
1431
0
  bf1[36] = -bf0[36] + bf0[35];
1432
0
  bf1[37] = -bf0[37] + bf0[34];
1433
0
  bf1[38] = -bf0[38] + bf0[33];
1434
0
  bf1[39] = -bf0[39] + bf0[32];
1435
0
  bf1[40] = -bf0[40] + bf0[47];
1436
0
  bf1[41] = -bf0[41] + bf0[46];
1437
0
  bf1[42] = -bf0[42] + bf0[45];
1438
0
  bf1[43] = -bf0[43] + bf0[44];
1439
0
  bf1[44] = bf0[44] + bf0[43];
1440
0
  bf1[45] = bf0[45] + bf0[42];
1441
0
  bf1[46] = bf0[46] + bf0[41];
1442
0
  bf1[47] = bf0[47] + bf0[40];
1443
0
  bf1[48] = bf0[48] + bf0[55];
1444
0
  bf1[49] = bf0[49] + bf0[54];
1445
0
  bf1[50] = bf0[50] + bf0[53];
1446
0
  bf1[51] = bf0[51] + bf0[52];
1447
0
  bf1[52] = -bf0[52] + bf0[51];
1448
0
  bf1[53] = -bf0[53] + bf0[50];
1449
0
  bf1[54] = -bf0[54] + bf0[49];
1450
0
  bf1[55] = -bf0[55] + bf0[48];
1451
0
  bf1[56] = -bf0[56] + bf0[63];
1452
0
  bf1[57] = -bf0[57] + bf0[62];
1453
0
  bf1[58] = -bf0[58] + bf0[61];
1454
0
  bf1[59] = -bf0[59] + bf0[60];
1455
0
  bf1[60] = bf0[60] + bf0[59];
1456
0
  bf1[61] = bf0[61] + bf0[58];
1457
0
  bf1[62] = bf0[62] + bf0[57];
1458
0
  bf1[63] = bf0[63] + bf0[56];
1459
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1460
1461
  // stage 6
1462
0
  stage++;
1463
0
  cospi = cospi_arr(cos_bit);
1464
0
  bf0 = output;
1465
0
  bf1 = step;
1466
0
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
1467
0
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
1468
0
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
1469
0
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
1470
0
  bf1[4] = bf0[4] + bf0[5];
1471
0
  bf1[5] = -bf0[5] + bf0[4];
1472
0
  bf1[6] = -bf0[6] + bf0[7];
1473
0
  bf1[7] = bf0[7] + bf0[6];
1474
0
  bf1[8] = bf0[8];
1475
0
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
1476
0
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
1477
0
  bf1[11] = bf0[11];
1478
0
  bf1[12] = bf0[12];
1479
0
  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
1480
0
  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
1481
0
  bf1[15] = bf0[15];
1482
0
  bf1[16] = bf0[16] + bf0[19];
1483
0
  bf1[17] = bf0[17] + bf0[18];
1484
0
  bf1[18] = -bf0[18] + bf0[17];
1485
0
  bf1[19] = -bf0[19] + bf0[16];
1486
0
  bf1[20] = -bf0[20] + bf0[23];
1487
0
  bf1[21] = -bf0[21] + bf0[22];
1488
0
  bf1[22] = bf0[22] + bf0[21];
1489
0
  bf1[23] = bf0[23] + bf0[20];
1490
0
  bf1[24] = bf0[24] + bf0[27];
1491
0
  bf1[25] = bf0[25] + bf0[26];
1492
0
  bf1[26] = -bf0[26] + bf0[25];
1493
0
  bf1[27] = -bf0[27] + bf0[24];
1494
0
  bf1[28] = -bf0[28] + bf0[31];
1495
0
  bf1[29] = -bf0[29] + bf0[30];
1496
0
  bf1[30] = bf0[30] + bf0[29];
1497
0
  bf1[31] = bf0[31] + bf0[28];
1498
0
  bf1[32] = bf0[32];
1499
0
  bf1[33] = bf0[33];
1500
0
  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
1501
0
  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
1502
0
  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
1503
0
  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
1504
0
  bf1[38] = bf0[38];
1505
0
  bf1[39] = bf0[39];
1506
0
  bf1[40] = bf0[40];
1507
0
  bf1[41] = bf0[41];
1508
0
  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
1509
0
  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
1510
0
  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
1511
0
  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
1512
0
  bf1[46] = bf0[46];
1513
0
  bf1[47] = bf0[47];
1514
0
  bf1[48] = bf0[48];
1515
0
  bf1[49] = bf0[49];
1516
0
  bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
1517
0
  bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
1518
0
  bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
1519
0
  bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
1520
0
  bf1[54] = bf0[54];
1521
0
  bf1[55] = bf0[55];
1522
0
  bf1[56] = bf0[56];
1523
0
  bf1[57] = bf0[57];
1524
0
  bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
1525
0
  bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
1526
0
  bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
1527
0
  bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
1528
0
  bf1[62] = bf0[62];
1529
0
  bf1[63] = bf0[63];
1530
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1531
1532
  // stage 7
1533
0
  stage++;
1534
0
  cospi = cospi_arr(cos_bit);
1535
0
  bf0 = step;
1536
0
  bf1 = output;
1537
0
  bf1[0] = bf0[0];
1538
0
  bf1[1] = bf0[1];
1539
0
  bf1[2] = bf0[2];
1540
0
  bf1[3] = bf0[3];
1541
0
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
1542
0
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
1543
0
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
1544
0
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
1545
0
  bf1[8] = bf0[8] + bf0[9];
1546
0
  bf1[9] = -bf0[9] + bf0[8];
1547
0
  bf1[10] = -bf0[10] + bf0[11];
1548
0
  bf1[11] = bf0[11] + bf0[10];
1549
0
  bf1[12] = bf0[12] + bf0[13];
1550
0
  bf1[13] = -bf0[13] + bf0[12];
1551
0
  bf1[14] = -bf0[14] + bf0[15];
1552
0
  bf1[15] = bf0[15] + bf0[14];
1553
0
  bf1[16] = bf0[16];
1554
0
  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
1555
0
  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
1556
0
  bf1[19] = bf0[19];
1557
0
  bf1[20] = bf0[20];
1558
0
  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
1559
0
  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
1560
0
  bf1[23] = bf0[23];
1561
0
  bf1[24] = bf0[24];
1562
0
  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
1563
0
  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
1564
0
  bf1[27] = bf0[27];
1565
0
  bf1[28] = bf0[28];
1566
0
  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
1567
0
  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
1568
0
  bf1[31] = bf0[31];
1569
0
  bf1[32] = bf0[32] + bf0[35];
1570
0
  bf1[33] = bf0[33] + bf0[34];
1571
0
  bf1[34] = -bf0[34] + bf0[33];
1572
0
  bf1[35] = -bf0[35] + bf0[32];
1573
0
  bf1[36] = -bf0[36] + bf0[39];
1574
0
  bf1[37] = -bf0[37] + bf0[38];
1575
0
  bf1[38] = bf0[38] + bf0[37];
1576
0
  bf1[39] = bf0[39] + bf0[36];
1577
0
  bf1[40] = bf0[40] + bf0[43];
1578
0
  bf1[41] = bf0[41] + bf0[42];
1579
0
  bf1[42] = -bf0[42] + bf0[41];
1580
0
  bf1[43] = -bf0[43] + bf0[40];
1581
0
  bf1[44] = -bf0[44] + bf0[47];
1582
0
  bf1[45] = -bf0[45] + bf0[46];
1583
0
  bf1[46] = bf0[46] + bf0[45];
1584
0
  bf1[47] = bf0[47] + bf0[44];
1585
0
  bf1[48] = bf0[48] + bf0[51];
1586
0
  bf1[49] = bf0[49] + bf0[50];
1587
0
  bf1[50] = -bf0[50] + bf0[49];
1588
0
  bf1[51] = -bf0[51] + bf0[48];
1589
0
  bf1[52] = -bf0[52] + bf0[55];
1590
0
  bf1[53] = -bf0[53] + bf0[54];
1591
0
  bf1[54] = bf0[54] + bf0[53];
1592
0
  bf1[55] = bf0[55] + bf0[52];
1593
0
  bf1[56] = bf0[56] + bf0[59];
1594
0
  bf1[57] = bf0[57] + bf0[58];
1595
0
  bf1[58] = -bf0[58] + bf0[57];
1596
0
  bf1[59] = -bf0[59] + bf0[56];
1597
0
  bf1[60] = -bf0[60] + bf0[63];
1598
0
  bf1[61] = -bf0[61] + bf0[62];
1599
0
  bf1[62] = bf0[62] + bf0[61];
1600
0
  bf1[63] = bf0[63] + bf0[60];
1601
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1602
1603
  // stage 8
1604
0
  stage++;
1605
0
  cospi = cospi_arr(cos_bit);
1606
0
  bf0 = output;
1607
0
  bf1 = step;
1608
0
  bf1[0] = bf0[0];
1609
0
  bf1[1] = bf0[1];
1610
0
  bf1[2] = bf0[2];
1611
0
  bf1[3] = bf0[3];
1612
0
  bf1[4] = bf0[4];
1613
0
  bf1[5] = bf0[5];
1614
0
  bf1[6] = bf0[6];
1615
0
  bf1[7] = bf0[7];
1616
0
  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
1617
0
  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
1618
0
  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
1619
0
  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
1620
0
  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
1621
0
  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
1622
0
  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
1623
0
  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
1624
0
  bf1[16] = bf0[16] + bf0[17];
1625
0
  bf1[17] = -bf0[17] + bf0[16];
1626
0
  bf1[18] = -bf0[18] + bf0[19];
1627
0
  bf1[19] = bf0[19] + bf0[18];
1628
0
  bf1[20] = bf0[20] + bf0[21];
1629
0
  bf1[21] = -bf0[21] + bf0[20];
1630
0
  bf1[22] = -bf0[22] + bf0[23];
1631
0
  bf1[23] = bf0[23] + bf0[22];
1632
0
  bf1[24] = bf0[24] + bf0[25];
1633
0
  bf1[25] = -bf0[25] + bf0[24];
1634
0
  bf1[26] = -bf0[26] + bf0[27];
1635
0
  bf1[27] = bf0[27] + bf0[26];
1636
0
  bf1[28] = bf0[28] + bf0[29];
1637
0
  bf1[29] = -bf0[29] + bf0[28];
1638
0
  bf1[30] = -bf0[30] + bf0[31];
1639
0
  bf1[31] = bf0[31] + bf0[30];
1640
0
  bf1[32] = bf0[32];
1641
0
  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
1642
0
  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
1643
0
  bf1[35] = bf0[35];
1644
0
  bf1[36] = bf0[36];
1645
0
  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
1646
0
  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
1647
0
  bf1[39] = bf0[39];
1648
0
  bf1[40] = bf0[40];
1649
0
  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
1650
0
  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
1651
0
  bf1[43] = bf0[43];
1652
0
  bf1[44] = bf0[44];
1653
0
  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
1654
0
  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
1655
0
  bf1[47] = bf0[47];
1656
0
  bf1[48] = bf0[48];
1657
0
  bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
1658
0
  bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
1659
0
  bf1[51] = bf0[51];
1660
0
  bf1[52] = bf0[52];
1661
0
  bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
1662
0
  bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
1663
0
  bf1[55] = bf0[55];
1664
0
  bf1[56] = bf0[56];
1665
0
  bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
1666
0
  bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
1667
0
  bf1[59] = bf0[59];
1668
0
  bf1[60] = bf0[60];
1669
0
  bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
1670
0
  bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
1671
0
  bf1[63] = bf0[63];
1672
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1673
1674
  // stage 9
1675
0
  stage++;
1676
0
  cospi = cospi_arr(cos_bit);
1677
0
  bf0 = step;
1678
0
  bf1 = output;
1679
0
  bf1[0] = bf0[0];
1680
0
  bf1[1] = bf0[1];
1681
0
  bf1[2] = bf0[2];
1682
0
  bf1[3] = bf0[3];
1683
0
  bf1[4] = bf0[4];
1684
0
  bf1[5] = bf0[5];
1685
0
  bf1[6] = bf0[6];
1686
0
  bf1[7] = bf0[7];
1687
0
  bf1[8] = bf0[8];
1688
0
  bf1[9] = bf0[9];
1689
0
  bf1[10] = bf0[10];
1690
0
  bf1[11] = bf0[11];
1691
0
  bf1[12] = bf0[12];
1692
0
  bf1[13] = bf0[13];
1693
0
  bf1[14] = bf0[14];
1694
0
  bf1[15] = bf0[15];
1695
0
  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
1696
0
  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
1697
0
  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
1698
0
  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
1699
0
  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
1700
0
  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
1701
0
  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
1702
0
  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
1703
0
  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
1704
0
  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
1705
0
  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
1706
0
  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
1707
0
  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
1708
0
  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
1709
0
  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
1710
0
  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
1711
0
  bf1[32] = bf0[32] + bf0[33];
1712
0
  bf1[33] = -bf0[33] + bf0[32];
1713
0
  bf1[34] = -bf0[34] + bf0[35];
1714
0
  bf1[35] = bf0[35] + bf0[34];
1715
0
  bf1[36] = bf0[36] + bf0[37];
1716
0
  bf1[37] = -bf0[37] + bf0[36];
1717
0
  bf1[38] = -bf0[38] + bf0[39];
1718
0
  bf1[39] = bf0[39] + bf0[38];
1719
0
  bf1[40] = bf0[40] + bf0[41];
1720
0
  bf1[41] = -bf0[41] + bf0[40];
1721
0
  bf1[42] = -bf0[42] + bf0[43];
1722
0
  bf1[43] = bf0[43] + bf0[42];
1723
0
  bf1[44] = bf0[44] + bf0[45];
1724
0
  bf1[45] = -bf0[45] + bf0[44];
1725
0
  bf1[46] = -bf0[46] + bf0[47];
1726
0
  bf1[47] = bf0[47] + bf0[46];
1727
0
  bf1[48] = bf0[48] + bf0[49];
1728
0
  bf1[49] = -bf0[49] + bf0[48];
1729
0
  bf1[50] = -bf0[50] + bf0[51];
1730
0
  bf1[51] = bf0[51] + bf0[50];
1731
0
  bf1[52] = bf0[52] + bf0[53];
1732
0
  bf1[53] = -bf0[53] + bf0[52];
1733
0
  bf1[54] = -bf0[54] + bf0[55];
1734
0
  bf1[55] = bf0[55] + bf0[54];
1735
0
  bf1[56] = bf0[56] + bf0[57];
1736
0
  bf1[57] = -bf0[57] + bf0[56];
1737
0
  bf1[58] = -bf0[58] + bf0[59];
1738
0
  bf1[59] = bf0[59] + bf0[58];
1739
0
  bf1[60] = bf0[60] + bf0[61];
1740
0
  bf1[61] = -bf0[61] + bf0[60];
1741
0
  bf1[62] = -bf0[62] + bf0[63];
1742
0
  bf1[63] = bf0[63] + bf0[62];
1743
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1744
1745
  // stage 10
1746
0
  stage++;
1747
0
  cospi = cospi_arr(cos_bit);
1748
0
  bf0 = output;
1749
0
  bf1 = step;
1750
0
  bf1[0] = bf0[0];
1751
0
  bf1[1] = bf0[1];
1752
0
  bf1[2] = bf0[2];
1753
0
  bf1[3] = bf0[3];
1754
0
  bf1[4] = bf0[4];
1755
0
  bf1[5] = bf0[5];
1756
0
  bf1[6] = bf0[6];
1757
0
  bf1[7] = bf0[7];
1758
0
  bf1[8] = bf0[8];
1759
0
  bf1[9] = bf0[9];
1760
0
  bf1[10] = bf0[10];
1761
0
  bf1[11] = bf0[11];
1762
0
  bf1[12] = bf0[12];
1763
0
  bf1[13] = bf0[13];
1764
0
  bf1[14] = bf0[14];
1765
0
  bf1[15] = bf0[15];
1766
0
  bf1[16] = bf0[16];
1767
0
  bf1[17] = bf0[17];
1768
0
  bf1[18] = bf0[18];
1769
0
  bf1[19] = bf0[19];
1770
0
  bf1[20] = bf0[20];
1771
0
  bf1[21] = bf0[21];
1772
0
  bf1[22] = bf0[22];
1773
0
  bf1[23] = bf0[23];
1774
0
  bf1[24] = bf0[24];
1775
0
  bf1[25] = bf0[25];
1776
0
  bf1[26] = bf0[26];
1777
0
  bf1[27] = bf0[27];
1778
0
  bf1[28] = bf0[28];
1779
0
  bf1[29] = bf0[29];
1780
0
  bf1[30] = bf0[30];
1781
0
  bf1[31] = bf0[31];
1782
0
  bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
1783
0
  bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
1784
0
  bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
1785
0
  bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
1786
0
  bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
1787
0
  bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
1788
0
  bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
1789
0
  bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
1790
0
  bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
1791
0
  bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
1792
0
  bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
1793
0
  bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
1794
0
  bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
1795
0
  bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
1796
0
  bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
1797
0
  bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
1798
0
  bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
1799
0
  bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
1800
0
  bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
1801
0
  bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
1802
0
  bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
1803
0
  bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
1804
0
  bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
1805
0
  bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
1806
0
  bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
1807
0
  bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
1808
0
  bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
1809
0
  bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
1810
0
  bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
1811
0
  bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
1812
0
  bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
1813
0
  bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
1814
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1815
1816
  // stage 11
1817
0
  stage++;
1818
0
  bf0 = step;
1819
0
  bf1 = output;
1820
0
  bf1[0] = bf0[0];
1821
0
  bf1[1] = bf0[32];
1822
0
  bf1[2] = bf0[16];
1823
0
  bf1[3] = bf0[48];
1824
0
  bf1[4] = bf0[8];
1825
0
  bf1[5] = bf0[40];
1826
0
  bf1[6] = bf0[24];
1827
0
  bf1[7] = bf0[56];
1828
0
  bf1[8] = bf0[4];
1829
0
  bf1[9] = bf0[36];
1830
0
  bf1[10] = bf0[20];
1831
0
  bf1[11] = bf0[52];
1832
0
  bf1[12] = bf0[12];
1833
0
  bf1[13] = bf0[44];
1834
0
  bf1[14] = bf0[28];
1835
0
  bf1[15] = bf0[60];
1836
0
  bf1[16] = bf0[2];
1837
0
  bf1[17] = bf0[34];
1838
0
  bf1[18] = bf0[18];
1839
0
  bf1[19] = bf0[50];
1840
0
  bf1[20] = bf0[10];
1841
0
  bf1[21] = bf0[42];
1842
0
  bf1[22] = bf0[26];
1843
0
  bf1[23] = bf0[58];
1844
0
  bf1[24] = bf0[6];
1845
0
  bf1[25] = bf0[38];
1846
0
  bf1[26] = bf0[22];
1847
0
  bf1[27] = bf0[54];
1848
0
  bf1[28] = bf0[14];
1849
0
  bf1[29] = bf0[46];
1850
0
  bf1[30] = bf0[30];
1851
0
  bf1[31] = bf0[62];
1852
0
  bf1[32] = bf0[1];
1853
0
  bf1[33] = bf0[33];
1854
0
  bf1[34] = bf0[17];
1855
0
  bf1[35] = bf0[49];
1856
0
  bf1[36] = bf0[9];
1857
0
  bf1[37] = bf0[41];
1858
0
  bf1[38] = bf0[25];
1859
0
  bf1[39] = bf0[57];
1860
0
  bf1[40] = bf0[5];
1861
0
  bf1[41] = bf0[37];
1862
0
  bf1[42] = bf0[21];
1863
0
  bf1[43] = bf0[53];
1864
0
  bf1[44] = bf0[13];
1865
0
  bf1[45] = bf0[45];
1866
0
  bf1[46] = bf0[29];
1867
0
  bf1[47] = bf0[61];
1868
0
  bf1[48] = bf0[3];
1869
0
  bf1[49] = bf0[35];
1870
0
  bf1[50] = bf0[19];
1871
0
  bf1[51] = bf0[51];
1872
0
  bf1[52] = bf0[11];
1873
0
  bf1[53] = bf0[43];
1874
0
  bf1[54] = bf0[27];
1875
0
  bf1[55] = bf0[59];
1876
0
  bf1[56] = bf0[7];
1877
0
  bf1[57] = bf0[39];
1878
0
  bf1[58] = bf0[23];
1879
0
  bf1[59] = bf0[55];
1880
0
  bf1[60] = bf0[15];
1881
0
  bf1[61] = bf0[47];
1882
0
  bf1[62] = bf0[31];
1883
0
  bf1[63] = bf0[63];
1884
0
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1885
0
}