Coverage Report

Created: 2022-08-24 06:17

/src/aom/av1/encoder/av1_fwd_txfm1d.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <stdlib.h>
13
#include "av1/encoder/av1_fwd_txfm1d.h"
14
#include "av1/common/av1_txfm.h"
15
16
void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
17
7.79M
               const int8_t *stage_range) {
18
7.79M
  const int32_t size = 4;
19
7.79M
  const int32_t *cospi;
20
21
7.79M
  int32_t stage = 0;
22
7.79M
  int32_t *bf0, *bf1;
23
7.79M
  int32_t step[4];
24
25
  // stage 0;
26
7.79M
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
27
28
  // stage 1;
29
7.79M
  stage++;
30
7.79M
  bf1 = output;
31
7.79M
  bf1[0] = input[0] + input[3];
32
7.79M
  bf1[1] = input[1] + input[2];
33
7.79M
  bf1[2] = -input[2] + input[1];
34
7.79M
  bf1[3] = -input[3] + input[0];
35
7.79M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
36
37
  // stage 2
38
7.79M
  stage++;
39
7.79M
  cospi = cospi_arr(cos_bit);
40
7.79M
  bf0 = output;
41
7.79M
  bf1 = step;
42
7.79M
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
43
7.79M
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
44
7.79M
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
45
7.79M
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
46
7.79M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
47
48
  // stage 3
49
7.79M
  stage++;
50
7.79M
  bf0 = step;
51
7.79M
  bf1 = output;
52
7.79M
  bf1[0] = bf0[0];
53
7.79M
  bf1[1] = bf0[2];
54
7.79M
  bf1[2] = bf0[1];
55
7.79M
  bf1[3] = bf0[3];
56
7.79M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
57
7.79M
}
58
59
void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
60
23.2M
               const int8_t *stage_range) {
61
23.2M
  const int32_t size = 8;
62
23.2M
  const int32_t *cospi;
63
64
23.2M
  int32_t stage = 0;
65
23.2M
  int32_t *bf0, *bf1;
66
23.2M
  int32_t step[8];
67
68
  // stage 0;
69
23.2M
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
70
71
  // stage 1;
72
23.2M
  stage++;
73
23.2M
  bf1 = output;
74
23.2M
  bf1[0] = input[0] + input[7];
75
23.2M
  bf1[1] = input[1] + input[6];
76
23.2M
  bf1[2] = input[2] + input[5];
77
23.2M
  bf1[3] = input[3] + input[4];
78
23.2M
  bf1[4] = -input[4] + input[3];
79
23.2M
  bf1[5] = -input[5] + input[2];
80
23.2M
  bf1[6] = -input[6] + input[1];
81
23.2M
  bf1[7] = -input[7] + input[0];
82
23.2M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
83
84
  // stage 2
85
23.2M
  stage++;
86
23.2M
  cospi = cospi_arr(cos_bit);
87
23.2M
  bf0 = output;
88
23.2M
  bf1 = step;
89
23.2M
  bf1[0] = bf0[0] + bf0[3];
90
23.2M
  bf1[1] = bf0[1] + bf0[2];
91
23.2M
  bf1[2] = -bf0[2] + bf0[1];
92
23.2M
  bf1[3] = -bf0[3] + bf0[0];
93
23.2M
  bf1[4] = bf0[4];
94
23.2M
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
95
23.2M
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
96
23.2M
  bf1[7] = bf0[7];
97
23.2M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
98
99
  // stage 3
100
23.2M
  stage++;
101
23.2M
  cospi = cospi_arr(cos_bit);
102
23.2M
  bf0 = step;
103
23.2M
  bf1 = output;
104
23.2M
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
105
23.2M
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
106
23.2M
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
107
23.2M
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
108
23.2M
  bf1[4] = bf0[4] + bf0[5];
109
23.2M
  bf1[5] = -bf0[5] + bf0[4];
110
23.2M
  bf1[6] = -bf0[6] + bf0[7];
111
23.2M
  bf1[7] = bf0[7] + bf0[6];
112
23.2M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
113
114
  // stage 4
115
23.2M
  stage++;
116
23.2M
  cospi = cospi_arr(cos_bit);
117
23.2M
  bf0 = output;
118
23.2M
  bf1 = step;
119
23.2M
  bf1[0] = bf0[0];
120
23.2M
  bf1[1] = bf0[1];
121
23.2M
  bf1[2] = bf0[2];
122
23.2M
  bf1[3] = bf0[3];
123
23.2M
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
124
23.2M
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
125
23.2M
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
126
23.2M
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
127
23.2M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
128
129
  // stage 5
130
23.2M
  stage++;
131
23.2M
  bf0 = step;
132
23.2M
  bf1 = output;
133
23.2M
  bf1[0] = bf0[0];
134
23.2M
  bf1[1] = bf0[4];
135
23.2M
  bf1[2] = bf0[2];
136
23.2M
  bf1[3] = bf0[6];
137
23.2M
  bf1[4] = bf0[1];
138
23.2M
  bf1[5] = bf0[5];
139
23.2M
  bf1[6] = bf0[3];
140
23.2M
  bf1[7] = bf0[7];
141
23.2M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
142
23.2M
}
143
144
void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
145
28.3M
                const int8_t *stage_range) {
146
28.3M
  const int32_t size = 16;
147
28.3M
  const int32_t *cospi;
148
149
28.3M
  int32_t stage = 0;
150
28.3M
  int32_t *bf0, *bf1;
151
28.3M
  int32_t step[16];
152
153
  // stage 0;
154
28.3M
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
155
156
  // stage 1;
157
28.3M
  stage++;
158
28.3M
  bf1 = output;
159
28.3M
  bf1[0] = input[0] + input[15];
160
28.3M
  bf1[1] = input[1] + input[14];
161
28.3M
  bf1[2] = input[2] + input[13];
162
28.3M
  bf1[3] = input[3] + input[12];
163
28.3M
  bf1[4] = input[4] + input[11];
164
28.3M
  bf1[5] = input[5] + input[10];
165
28.3M
  bf1[6] = input[6] + input[9];
166
28.3M
  bf1[7] = input[7] + input[8];
167
28.3M
  bf1[8] = -input[8] + input[7];
168
28.3M
  bf1[9] = -input[9] + input[6];
169
28.3M
  bf1[10] = -input[10] + input[5];
170
28.3M
  bf1[11] = -input[11] + input[4];
171
28.3M
  bf1[12] = -input[12] + input[3];
172
28.3M
  bf1[13] = -input[13] + input[2];
173
28.3M
  bf1[14] = -input[14] + input[1];
174
28.3M
  bf1[15] = -input[15] + input[0];
175
28.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
176
177
  // stage 2
178
28.3M
  stage++;
179
28.3M
  cospi = cospi_arr(cos_bit);
180
28.3M
  bf0 = output;
181
28.3M
  bf1 = step;
182
28.3M
  bf1[0] = bf0[0] + bf0[7];
183
28.3M
  bf1[1] = bf0[1] + bf0[6];
184
28.3M
  bf1[2] = bf0[2] + bf0[5];
185
28.3M
  bf1[3] = bf0[3] + bf0[4];
186
28.3M
  bf1[4] = -bf0[4] + bf0[3];
187
28.3M
  bf1[5] = -bf0[5] + bf0[2];
188
28.3M
  bf1[6] = -bf0[6] + bf0[1];
189
28.3M
  bf1[7] = -bf0[7] + bf0[0];
190
28.3M
  bf1[8] = bf0[8];
191
28.3M
  bf1[9] = bf0[9];
192
28.3M
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
193
28.3M
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
194
28.3M
  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
195
28.3M
  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
196
28.3M
  bf1[14] = bf0[14];
197
28.3M
  bf1[15] = bf0[15];
198
28.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
199
200
  // stage 3
201
28.3M
  stage++;
202
28.3M
  cospi = cospi_arr(cos_bit);
203
28.3M
  bf0 = step;
204
28.3M
  bf1 = output;
205
28.3M
  bf1[0] = bf0[0] + bf0[3];
206
28.3M
  bf1[1] = bf0[1] + bf0[2];
207
28.3M
  bf1[2] = -bf0[2] + bf0[1];
208
28.3M
  bf1[3] = -bf0[3] + bf0[0];
209
28.3M
  bf1[4] = bf0[4];
210
28.3M
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
211
28.3M
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
212
28.3M
  bf1[7] = bf0[7];
213
28.3M
  bf1[8] = bf0[8] + bf0[11];
214
28.3M
  bf1[9] = bf0[9] + bf0[10];
215
28.3M
  bf1[10] = -bf0[10] + bf0[9];
216
28.3M
  bf1[11] = -bf0[11] + bf0[8];
217
28.3M
  bf1[12] = -bf0[12] + bf0[15];
218
28.3M
  bf1[13] = -bf0[13] + bf0[14];
219
28.3M
  bf1[14] = bf0[14] + bf0[13];
220
28.3M
  bf1[15] = bf0[15] + bf0[12];
221
28.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
222
223
  // stage 4
224
28.3M
  stage++;
225
28.3M
  cospi = cospi_arr(cos_bit);
226
28.3M
  bf0 = output;
227
28.3M
  bf1 = step;
228
28.3M
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
229
28.3M
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
230
28.3M
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
231
28.3M
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
232
28.3M
  bf1[4] = bf0[4] + bf0[5];
233
28.3M
  bf1[5] = -bf0[5] + bf0[4];
234
28.3M
  bf1[6] = -bf0[6] + bf0[7];
235
28.3M
  bf1[7] = bf0[7] + bf0[6];
236
28.3M
  bf1[8] = bf0[8];
237
28.3M
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
238
28.3M
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
239
28.3M
  bf1[11] = bf0[11];
240
28.3M
  bf1[12] = bf0[12];
241
28.3M
  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
242
28.3M
  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
243
28.3M
  bf1[15] = bf0[15];
244
28.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
245
246
  // stage 5
247
28.3M
  stage++;
248
28.3M
  cospi = cospi_arr(cos_bit);
249
28.3M
  bf0 = step;
250
28.3M
  bf1 = output;
251
28.3M
  bf1[0] = bf0[0];
252
28.3M
  bf1[1] = bf0[1];
253
28.3M
  bf1[2] = bf0[2];
254
28.3M
  bf1[3] = bf0[3];
255
28.3M
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
256
28.3M
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
257
28.3M
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
258
28.3M
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
259
28.3M
  bf1[8] = bf0[8] + bf0[9];
260
28.3M
  bf1[9] = -bf0[9] + bf0[8];
261
28.3M
  bf1[10] = -bf0[10] + bf0[11];
262
28.3M
  bf1[11] = bf0[11] + bf0[10];
263
28.3M
  bf1[12] = bf0[12] + bf0[13];
264
28.3M
  bf1[13] = -bf0[13] + bf0[12];
265
28.3M
  bf1[14] = -bf0[14] + bf0[15];
266
28.3M
  bf1[15] = bf0[15] + bf0[14];
267
28.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
268
269
  // stage 6
270
28.3M
  stage++;
271
28.3M
  cospi = cospi_arr(cos_bit);
272
28.3M
  bf0 = output;
273
28.3M
  bf1 = step;
274
28.3M
  bf1[0] = bf0[0];
275
28.3M
  bf1[1] = bf0[1];
276
28.3M
  bf1[2] = bf0[2];
277
28.3M
  bf1[3] = bf0[3];
278
28.3M
  bf1[4] = bf0[4];
279
28.3M
  bf1[5] = bf0[5];
280
28.3M
  bf1[6] = bf0[6];
281
28.3M
  bf1[7] = bf0[7];
282
28.3M
  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
283
28.3M
  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
284
28.3M
  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
285
28.3M
  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
286
28.3M
  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
287
28.3M
  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
288
28.3M
  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
289
28.3M
  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
290
28.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
291
292
  // stage 7
293
28.3M
  stage++;
294
28.3M
  bf0 = step;
295
28.3M
  bf1 = output;
296
28.3M
  bf1[0] = bf0[0];
297
28.3M
  bf1[1] = bf0[8];
298
28.3M
  bf1[2] = bf0[4];
299
28.3M
  bf1[3] = bf0[12];
300
28.3M
  bf1[4] = bf0[2];
301
28.3M
  bf1[5] = bf0[10];
302
28.3M
  bf1[6] = bf0[6];
303
28.3M
  bf1[7] = bf0[14];
304
28.3M
  bf1[8] = bf0[1];
305
28.3M
  bf1[9] = bf0[9];
306
28.3M
  bf1[10] = bf0[5];
307
28.3M
  bf1[11] = bf0[13];
308
28.3M
  bf1[12] = bf0[3];
309
28.3M
  bf1[13] = bf0[11];
310
28.3M
  bf1[14] = bf0[7];
311
28.3M
  bf1[15] = bf0[15];
312
28.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
313
28.3M
}
314
315
void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
316
33.9M
                const int8_t *stage_range) {
317
33.9M
  const int32_t size = 32;
318
33.9M
  const int32_t *cospi;
319
320
33.9M
  int32_t stage = 0;
321
33.9M
  int32_t *bf0, *bf1;
322
33.9M
  int32_t step[32];
323
324
  // stage 0;
325
33.9M
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
326
327
  // stage 1;
328
33.9M
  stage++;
329
33.9M
  bf1 = output;
330
33.9M
  bf1[0] = input[0] + input[31];
331
33.9M
  bf1[1] = input[1] + input[30];
332
33.9M
  bf1[2] = input[2] + input[29];
333
33.9M
  bf1[3] = input[3] + input[28];
334
33.9M
  bf1[4] = input[4] + input[27];
335
33.9M
  bf1[5] = input[5] + input[26];
336
33.9M
  bf1[6] = input[6] + input[25];
337
33.9M
  bf1[7] = input[7] + input[24];
338
33.9M
  bf1[8] = input[8] + input[23];
339
33.9M
  bf1[9] = input[9] + input[22];
340
33.9M
  bf1[10] = input[10] + input[21];
341
33.9M
  bf1[11] = input[11] + input[20];
342
33.9M
  bf1[12] = input[12] + input[19];
343
33.9M
  bf1[13] = input[13] + input[18];
344
33.9M
  bf1[14] = input[14] + input[17];
345
33.9M
  bf1[15] = input[15] + input[16];
346
33.9M
  bf1[16] = -input[16] + input[15];
347
33.9M
  bf1[17] = -input[17] + input[14];
348
33.9M
  bf1[18] = -input[18] + input[13];
349
33.9M
  bf1[19] = -input[19] + input[12];
350
33.9M
  bf1[20] = -input[20] + input[11];
351
33.9M
  bf1[21] = -input[21] + input[10];
352
33.9M
  bf1[22] = -input[22] + input[9];
353
33.9M
  bf1[23] = -input[23] + input[8];
354
33.9M
  bf1[24] = -input[24] + input[7];
355
33.9M
  bf1[25] = -input[25] + input[6];
356
33.9M
  bf1[26] = -input[26] + input[5];
357
33.9M
  bf1[27] = -input[27] + input[4];
358
33.9M
  bf1[28] = -input[28] + input[3];
359
33.9M
  bf1[29] = -input[29] + input[2];
360
33.9M
  bf1[30] = -input[30] + input[1];
361
33.9M
  bf1[31] = -input[31] + input[0];
362
33.9M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
363
364
  // stage 2
365
33.9M
  stage++;
366
33.9M
  cospi = cospi_arr(cos_bit);
367
33.9M
  bf0 = output;
368
33.9M
  bf1 = step;
369
33.9M
  bf1[0] = bf0[0] + bf0[15];
370
33.9M
  bf1[1] = bf0[1] + bf0[14];
371
33.9M
  bf1[2] = bf0[2] + bf0[13];
372
33.9M
  bf1[3] = bf0[3] + bf0[12];
373
33.9M
  bf1[4] = bf0[4] + bf0[11];
374
33.9M
  bf1[5] = bf0[5] + bf0[10];
375
33.9M
  bf1[6] = bf0[6] + bf0[9];
376
33.9M
  bf1[7] = bf0[7] + bf0[8];
377
33.9M
  bf1[8] = -bf0[8] + bf0[7];
378
33.9M
  bf1[9] = -bf0[9] + bf0[6];
379
33.9M
  bf1[10] = -bf0[10] + bf0[5];
380
33.9M
  bf1[11] = -bf0[11] + bf0[4];
381
33.9M
  bf1[12] = -bf0[12] + bf0[3];
382
33.9M
  bf1[13] = -bf0[13] + bf0[2];
383
33.9M
  bf1[14] = -bf0[14] + bf0[1];
384
33.9M
  bf1[15] = -bf0[15] + bf0[0];
385
33.9M
  bf1[16] = bf0[16];
386
33.9M
  bf1[17] = bf0[17];
387
33.9M
  bf1[18] = bf0[18];
388
33.9M
  bf1[19] = bf0[19];
389
33.9M
  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
390
33.9M
  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
391
33.9M
  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
392
33.9M
  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
393
33.9M
  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
394
33.9M
  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
395
33.9M
  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
396
33.9M
  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
397
33.9M
  bf1[28] = bf0[28];
398
33.9M
  bf1[29] = bf0[29];
399
33.9M
  bf1[30] = bf0[30];
400
33.9M
  bf1[31] = bf0[31];
401
33.9M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
402
403
  // stage 3
404
33.9M
  stage++;
405
33.9M
  cospi = cospi_arr(cos_bit);
406
33.9M
  bf0 = step;
407
33.9M
  bf1 = output;
408
33.9M
  bf1[0] = bf0[0] + bf0[7];
409
33.9M
  bf1[1] = bf0[1] + bf0[6];
410
33.9M
  bf1[2] = bf0[2] + bf0[5];
411
33.9M
  bf1[3] = bf0[3] + bf0[4];
412
33.9M
  bf1[4] = -bf0[4] + bf0[3];
413
33.9M
  bf1[5] = -bf0[5] + bf0[2];
414
33.9M
  bf1[6] = -bf0[6] + bf0[1];
415
33.9M
  bf1[7] = -bf0[7] + bf0[0];
416
33.9M
  bf1[8] = bf0[8];
417
33.9M
  bf1[9] = bf0[9];
418
33.9M
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
419
33.9M
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
420
33.9M
  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
421
33.9M
  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
422
33.9M
  bf1[14] = bf0[14];
423
33.9M
  bf1[15] = bf0[15];
424
33.9M
  bf1[16] = bf0[16] + bf0[23];
425
33.9M
  bf1[17] = bf0[17] + bf0[22];
426
33.9M
  bf1[18] = bf0[18] + bf0[21];
427
33.9M
  bf1[19] = bf0[19] + bf0[20];
428
33.9M
  bf1[20] = -bf0[20] + bf0[19];
429
33.9M
  bf1[21] = -bf0[21] + bf0[18];
430
33.9M
  bf1[22] = -bf0[22] + bf0[17];
431
33.9M
  bf1[23] = -bf0[23] + bf0[16];
432
33.9M
  bf1[24] = -bf0[24] + bf0[31];
433
33.9M
  bf1[25] = -bf0[25] + bf0[30];
434
33.9M
  bf1[26] = -bf0[26] + bf0[29];
435
33.9M
  bf1[27] = -bf0[27] + bf0[28];
436
33.9M
  bf1[28] = bf0[28] + bf0[27];
437
33.9M
  bf1[29] = bf0[29] + bf0[26];
438
33.9M
  bf1[30] = bf0[30] + bf0[25];
439
33.9M
  bf1[31] = bf0[31] + bf0[24];
440
33.9M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
441
442
  // stage 4
443
33.9M
  stage++;
444
33.9M
  cospi = cospi_arr(cos_bit);
445
33.9M
  bf0 = output;
446
33.9M
  bf1 = step;
447
33.9M
  bf1[0] = bf0[0] + bf0[3];
448
33.9M
  bf1[1] = bf0[1] + bf0[2];
449
33.9M
  bf1[2] = -bf0[2] + bf0[1];
450
33.9M
  bf1[3] = -bf0[3] + bf0[0];
451
33.9M
  bf1[4] = bf0[4];
452
33.9M
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
453
33.9M
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
454
33.9M
  bf1[7] = bf0[7];
455
33.9M
  bf1[8] = bf0[8] + bf0[11];
456
33.9M
  bf1[9] = bf0[9] + bf0[10];
457
33.9M
  bf1[10] = -bf0[10] + bf0[9];
458
33.9M
  bf1[11] = -bf0[11] + bf0[8];
459
33.9M
  bf1[12] = -bf0[12] + bf0[15];
460
33.9M
  bf1[13] = -bf0[13] + bf0[14];
461
33.9M
  bf1[14] = bf0[14] + bf0[13];
462
33.9M
  bf1[15] = bf0[15] + bf0[12];
463
33.9M
  bf1[16] = bf0[16];
464
33.9M
  bf1[17] = bf0[17];
465
33.9M
  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
466
33.9M
  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
467
33.9M
  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
468
33.9M
  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
469
33.9M
  bf1[22] = bf0[22];
470
33.9M
  bf1[23] = bf0[23];
471
33.9M
  bf1[24] = bf0[24];
472
33.9M
  bf1[25] = bf0[25];
473
33.9M
  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
474
33.9M
  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
475
33.9M
  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
476
33.9M
  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
477
33.9M
  bf1[30] = bf0[30];
478
33.9M
  bf1[31] = bf0[31];
479
33.9M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
480
481
  // stage 5
482
33.9M
  stage++;
483
33.9M
  cospi = cospi_arr(cos_bit);
484
33.9M
  bf0 = step;
485
33.9M
  bf1 = output;
486
33.9M
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
487
33.9M
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
488
33.9M
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
489
33.9M
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
490
33.9M
  bf1[4] = bf0[4] + bf0[5];
491
33.9M
  bf1[5] = -bf0[5] + bf0[4];
492
33.9M
  bf1[6] = -bf0[6] + bf0[7];
493
33.9M
  bf1[7] = bf0[7] + bf0[6];
494
33.9M
  bf1[8] = bf0[8];
495
33.9M
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
496
33.9M
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
497
33.9M
  bf1[11] = bf0[11];
498
33.9M
  bf1[12] = bf0[12];
499
33.9M
  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
500
33.9M
  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
501
33.9M
  bf1[15] = bf0[15];
502
33.9M
  bf1[16] = bf0[16] + bf0[19];
503
33.9M
  bf1[17] = bf0[17] + bf0[18];
504
33.9M
  bf1[18] = -bf0[18] + bf0[17];
505
33.9M
  bf1[19] = -bf0[19] + bf0[16];
506
33.9M
  bf1[20] = -bf0[20] + bf0[23];
507
33.9M
  bf1[21] = -bf0[21] + bf0[22];
508
33.9M
  bf1[22] = bf0[22] + bf0[21];
509
33.9M
  bf1[23] = bf0[23] + bf0[20];
510
33.9M
  bf1[24] = bf0[24] + bf0[27];
511
33.9M
  bf1[25] = bf0[25] + bf0[26];
512
33.9M
  bf1[26] = -bf0[26] + bf0[25];
513
33.9M
  bf1[27] = -bf0[27] + bf0[24];
514
33.9M
  bf1[28] = -bf0[28] + bf0[31];
515
33.9M
  bf1[29] = -bf0[29] + bf0[30];
516
33.9M
  bf1[30] = bf0[30] + bf0[29];
517
33.9M
  bf1[31] = bf0[31] + bf0[28];
518
33.9M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
519
520
  // stage 6
521
33.9M
  stage++;
522
33.9M
  cospi = cospi_arr(cos_bit);
523
33.9M
  bf0 = output;
524
33.9M
  bf1 = step;
525
33.9M
  bf1[0] = bf0[0];
526
33.9M
  bf1[1] = bf0[1];
527
33.9M
  bf1[2] = bf0[2];
528
33.9M
  bf1[3] = bf0[3];
529
33.9M
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
530
33.9M
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
531
33.9M
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
532
33.9M
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
533
33.9M
  bf1[8] = bf0[8] + bf0[9];
534
33.9M
  bf1[9] = -bf0[9] + bf0[8];
535
33.9M
  bf1[10] = -bf0[10] + bf0[11];
536
33.9M
  bf1[11] = bf0[11] + bf0[10];
537
33.9M
  bf1[12] = bf0[12] + bf0[13];
538
33.9M
  bf1[13] = -bf0[13] + bf0[12];
539
33.9M
  bf1[14] = -bf0[14] + bf0[15];
540
33.9M
  bf1[15] = bf0[15] + bf0[14];
541
33.9M
  bf1[16] = bf0[16];
542
33.9M
  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
543
33.9M
  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
544
33.9M
  bf1[19] = bf0[19];
545
33.9M
  bf1[20] = bf0[20];
546
33.9M
  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
547
33.9M
  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
548
33.9M
  bf1[23] = bf0[23];
549
33.9M
  bf1[24] = bf0[24];
550
33.9M
  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
551
33.9M
  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
552
33.9M
  bf1[27] = bf0[27];
553
33.9M
  bf1[28] = bf0[28];
554
33.9M
  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
555
33.9M
  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
556
33.9M
  bf1[31] = bf0[31];
557
33.9M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
558
559
  // stage 7
560
33.9M
  stage++;
561
33.9M
  cospi = cospi_arr(cos_bit);
562
33.9M
  bf0 = step;
563
33.9M
  bf1 = output;
564
33.9M
  bf1[0] = bf0[0];
565
33.9M
  bf1[1] = bf0[1];
566
33.9M
  bf1[2] = bf0[2];
567
33.9M
  bf1[3] = bf0[3];
568
33.9M
  bf1[4] = bf0[4];
569
33.9M
  bf1[5] = bf0[5];
570
33.9M
  bf1[6] = bf0[6];
571
33.9M
  bf1[7] = bf0[7];
572
33.9M
  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
573
33.9M
  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
574
33.9M
  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
575
33.9M
  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
576
33.9M
  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
577
33.9M
  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
578
33.9M
  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
579
33.9M
  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
580
33.9M
  bf1[16] = bf0[16] + bf0[17];
581
33.9M
  bf1[17] = -bf0[17] + bf0[16];
582
33.9M
  bf1[18] = -bf0[18] + bf0[19];
583
33.9M
  bf1[19] = bf0[19] + bf0[18];
584
33.9M
  bf1[20] = bf0[20] + bf0[21];
585
33.9M
  bf1[21] = -bf0[21] + bf0[20];
586
33.9M
  bf1[22] = -bf0[22] + bf0[23];
587
33.9M
  bf1[23] = bf0[23] + bf0[22];
588
33.9M
  bf1[24] = bf0[24] + bf0[25];
589
33.9M
  bf1[25] = -bf0[25] + bf0[24];
590
33.9M
  bf1[26] = -bf0[26] + bf0[27];
591
33.9M
  bf1[27] = bf0[27] + bf0[26];
592
33.9M
  bf1[28] = bf0[28] + bf0[29];
593
33.9M
  bf1[29] = -bf0[29] + bf0[28];
594
33.9M
  bf1[30] = -bf0[30] + bf0[31];
595
33.9M
  bf1[31] = bf0[31] + bf0[30];
596
33.9M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
597
598
  // stage 8
599
33.9M
  stage++;
600
33.9M
  cospi = cospi_arr(cos_bit);
601
33.9M
  bf0 = output;
602
33.9M
  bf1 = step;
603
33.9M
  bf1[0] = bf0[0];
604
33.9M
  bf1[1] = bf0[1];
605
33.9M
  bf1[2] = bf0[2];
606
33.9M
  bf1[3] = bf0[3];
607
33.9M
  bf1[4] = bf0[4];
608
33.9M
  bf1[5] = bf0[5];
609
33.9M
  bf1[6] = bf0[6];
610
33.9M
  bf1[7] = bf0[7];
611
33.9M
  bf1[8] = bf0[8];
612
33.9M
  bf1[9] = bf0[9];
613
33.9M
  bf1[10] = bf0[10];
614
33.9M
  bf1[11] = bf0[11];
615
33.9M
  bf1[12] = bf0[12];
616
33.9M
  bf1[13] = bf0[13];
617
33.9M
  bf1[14] = bf0[14];
618
33.9M
  bf1[15] = bf0[15];
619
33.9M
  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
620
33.9M
  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
621
33.9M
  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
622
33.9M
  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
623
33.9M
  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
624
33.9M
  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
625
33.9M
  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
626
33.9M
  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
627
33.9M
  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
628
33.9M
  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
629
33.9M
  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
630
33.9M
  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
631
33.9M
  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
632
33.9M
  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
633
33.9M
  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
634
33.9M
  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
635
33.9M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
636
637
  // stage 9
638
33.9M
  stage++;
639
33.9M
  bf0 = step;
640
33.9M
  bf1 = output;
641
33.9M
  bf1[0] = bf0[0];
642
33.9M
  bf1[1] = bf0[16];
643
33.9M
  bf1[2] = bf0[8];
644
33.9M
  bf1[3] = bf0[24];
645
33.9M
  bf1[4] = bf0[4];
646
33.9M
  bf1[5] = bf0[20];
647
33.9M
  bf1[6] = bf0[12];
648
33.9M
  bf1[7] = bf0[28];
649
33.9M
  bf1[8] = bf0[2];
650
33.9M
  bf1[9] = bf0[18];
651
33.9M
  bf1[10] = bf0[10];
652
33.9M
  bf1[11] = bf0[26];
653
33.9M
  bf1[12] = bf0[6];
654
33.9M
  bf1[13] = bf0[22];
655
33.9M
  bf1[14] = bf0[14];
656
33.9M
  bf1[15] = bf0[30];
657
33.9M
  bf1[16] = bf0[1];
658
33.9M
  bf1[17] = bf0[17];
659
33.9M
  bf1[18] = bf0[9];
660
33.9M
  bf1[19] = bf0[25];
661
33.9M
  bf1[20] = bf0[5];
662
33.9M
  bf1[21] = bf0[21];
663
33.9M
  bf1[22] = bf0[13];
664
33.9M
  bf1[23] = bf0[29];
665
33.9M
  bf1[24] = bf0[3];
666
33.9M
  bf1[25] = bf0[19];
667
33.9M
  bf1[26] = bf0[11];
668
33.9M
  bf1[27] = bf0[27];
669
33.9M
  bf1[28] = bf0[7];
670
33.9M
  bf1[29] = bf0[23];
671
33.9M
  bf1[30] = bf0[15];
672
33.9M
  bf1[31] = bf0[31];
673
33.9M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
674
33.9M
}
675
676
void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
677
2.90M
                const int8_t *stage_range) {
678
2.90M
  int bit = cos_bit;
679
2.90M
  const int32_t *sinpi = sinpi_arr(bit);
680
2.90M
  int32_t x0, x1, x2, x3;
681
2.90M
  int32_t s0, s1, s2, s3, s4, s5, s6, s7;
682
683
  // stage 0
684
2.90M
  av1_range_check_buf(0, input, input, 4, stage_range[0]);
685
2.90M
  x0 = input[0];
686
2.90M
  x1 = input[1];
687
2.90M
  x2 = input[2];
688
2.90M
  x3 = input[3];
689
690
2.90M
  if (!(x0 | x1 | x2 | x3)) {
691
2.34M
    output[0] = output[1] = output[2] = output[3] = 0;
692
2.34M
    return;
693
2.34M
  }
694
695
  // stage 1
696
555k
  s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]);
697
555k
  s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]);
698
555k
  s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]);
699
555k
  s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]);
700
555k
  s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]);
701
555k
  s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]);
702
555k
  s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]);
703
555k
  s7 = range_check_value(x0 + x1, stage_range[1]);
704
705
  // stage 2
706
555k
  s7 = range_check_value(s7 - x3, stage_range[2]);
707
708
  // stage 3
709
555k
  x0 = range_check_value(s0 + s2, bit + stage_range[3]);
710
555k
  x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]);
711
555k
  x2 = range_check_value(s1 - s3, bit + stage_range[3]);
712
555k
  x3 = range_check_value(s4, bit + stage_range[3]);
713
714
  // stage 4
715
555k
  x0 = range_check_value(x0 + s5, bit + stage_range[4]);
716
555k
  x2 = range_check_value(x2 + s6, bit + stage_range[4]);
717
718
  // stage 5
719
555k
  s0 = range_check_value(x0 + x3, bit + stage_range[5]);
720
555k
  s1 = range_check_value(x1, bit + stage_range[5]);
721
555k
  s2 = range_check_value(x2 - x3, bit + stage_range[5]);
722
555k
  s3 = range_check_value(x2 - x0, bit + stage_range[5]);
723
724
  // stage 6
725
555k
  s3 = range_check_value(s3 + x3, bit + stage_range[6]);
726
727
  // 1-D transform scaling factor is sqrt(2).
728
555k
  output[0] = round_shift(s0, bit);
729
555k
  output[1] = round_shift(s1, bit);
730
555k
  output[2] = round_shift(s2, bit);
731
555k
  output[3] = round_shift(s3, bit);
732
555k
  av1_range_check_buf(6, input, output, 4, stage_range[6]);
733
555k
}
734
735
// 8-point forward ADST (name-derived), computed as a seven-stage butterfly
// network that ping-pongs between `output` and a local scratch buffer.
//
//   input       8 source samples; must not be the same pointer as `output`.
//   output      receives the 8 results (also used as working storage).
//   cos_bit     selects the weight table via cospi_arr() and is forwarded to
//               every half_btf() call.
//   stage_range stage_range[k] is handed to av1_range_check_buf() after
//               stage k (k = 0..7).
void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  // Final reordering: output[k] is read from slot kOutOrder[k] of stage 6.
  static const int8_t kOutOrder[8] = { 1, 6, 3, 4, 5, 2, 7, 0 };

  const int32_t n = 8;
  int32_t tmp[8];
  const int32_t *w;    // weight table returned by cospi_arr()
  const int32_t *src;  // buffer read by the current stage
  int32_t *dst;        // buffer written by the current stage
  int32_t stage = 0;

  // stage 0: check the raw input.
  av1_range_check_buf(stage, input, input, n, stage_range[stage]);

  // stage 1: signed permutation of the input into `output`.
  ++stage;
  assert(output != input);
  dst = output;
  dst[0] = input[0];
  dst[1] = -input[7];
  dst[2] = -input[3];
  dst[3] = input[4];
  dst[4] = -input[1];
  dst[5] = input[6];
  dst[6] = input[2];
  dst[7] = -input[5];
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 2: in every group of four, keep the first pair and rotate the
  // second pair with weight index 32.
  ++stage;
  w = cospi_arr(cos_bit);
  src = output;
  dst = tmp;
  for (int k = 0; k < n; k += 4) {
    dst[k] = src[k];
    dst[k + 1] = src[k + 1];
    dst[k + 2] = half_btf(w[32], src[k + 2], w[32], src[k + 3], cos_bit);
    dst[k + 3] = half_btf(w[32], src[k + 2], -w[32], src[k + 3], cos_bit);
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 3: add/subtract butterflies at distance 2 inside each group of 4.
  ++stage;
  src = tmp;
  dst = output;
  for (int k = 0; k < n; k += 4) {
    dst[k] = src[k] + src[k + 2];
    dst[k + 1] = src[k + 1] + src[k + 3];
    dst[k + 2] = src[k] - src[k + 2];
    dst[k + 3] = src[k + 1] - src[k + 3];
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 4: lower half passes through, upper half is rotated with
  // weight indices 16 and 48.
  ++stage;
  w = cospi_arr(cos_bit);
  src = output;
  dst = tmp;
  for (int k = 0; k < 4; ++k) {
    dst[k] = src[k];
  }
  dst[4] = half_btf(w[16], src[4], w[48], src[5], cos_bit);
  dst[5] = half_btf(w[48], src[4], -w[16], src[5], cos_bit);
  dst[6] = half_btf(-w[48], src[6], w[16], src[7], cos_bit);
  dst[7] = half_btf(w[16], src[6], w[48], src[7], cos_bit);
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 5: add/subtract butterflies at distance 4.
  ++stage;
  src = tmp;
  dst = output;
  for (int k = 0; k < 4; ++k) {
    dst[k] = src[k] + src[k + 4];
  }
  for (int k = 0; k < 4; ++k) {
    dst[k + 4] = src[k] - src[k + 4];
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 6: rotate each adjacent pair; pair j uses weight indices
  // (4 + 16j, 60 - 16j), i.e. (4,60), (20,44), (36,28), (52,12).
  ++stage;
  w = cospi_arr(cos_bit);
  src = output;
  dst = tmp;
  for (int j = 0; j < 4; ++j) {
    const int32_t wa = w[4 + 16 * j];
    const int32_t wb = w[60 - 16 * j];
    const int lo = 2 * j;
    const int hi = lo + 1;
    dst[lo] = half_btf(wa, src[lo], wb, src[hi], cos_bit);
    dst[hi] = half_btf(wb, src[lo], -wa, src[hi], cos_bit);
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 7: reorder into the final output positions.
  ++stage;
  src = tmp;
  dst = output;
  for (int k = 0; k < n; ++k) {
    dst[k] = src[kOutOrder[k]];
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);
}
848
849
// 16-point forward ADST (name-derived), computed as a nine-stage butterfly
// network that ping-pongs between `output` and a local scratch buffer.
//
//   input       16 source samples; must not be the same pointer as `output`.
//   output      receives the 16 results (also used as working storage).
//   cos_bit     selects the weight table via cospi_arr() and is forwarded to
//               every half_btf() call.
//   stage_range stage_range[k] is handed to av1_range_check_buf() after
//               stage k (k = 0..9).
void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                 const int8_t *stage_range) {
  // Final reordering: output[k] is read from slot kOutOrder[k] of stage 8.
  static const int8_t kOutOrder[16] = { 1, 14, 3,  12, 5,  10, 7,  8,
                                        9, 6,  11, 4,  13, 2,  15, 0 };

  const int32_t n = 16;
  int32_t tmp[16];
  const int32_t *w;    // weight table returned by cospi_arr()
  const int32_t *src;  // buffer read by the current stage
  int32_t *dst;        // buffer written by the current stage
  int32_t stage = 0;

  // stage 0: check the raw input.
  av1_range_check_buf(stage, input, input, n, stage_range[stage]);

  // stage 1: signed permutation of the input into `output`.
  ++stage;
  assert(output != input);
  dst = output;
  dst[0] = input[0];
  dst[1] = -input[15];
  dst[2] = -input[7];
  dst[3] = input[8];
  dst[4] = -input[3];
  dst[5] = input[12];
  dst[6] = input[4];
  dst[7] = -input[11];
  dst[8] = -input[1];
  dst[9] = input[14];
  dst[10] = input[6];
  dst[11] = -input[9];
  dst[12] = input[2];
  dst[13] = -input[13];
  dst[14] = -input[5];
  dst[15] = input[10];
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 2: in every group of four, keep the first pair and rotate the
  // second pair with weight index 32.
  ++stage;
  w = cospi_arr(cos_bit);
  src = output;
  dst = tmp;
  for (int k = 0; k < n; k += 4) {
    dst[k] = src[k];
    dst[k + 1] = src[k + 1];
    dst[k + 2] = half_btf(w[32], src[k + 2], w[32], src[k + 3], cos_bit);
    dst[k + 3] = half_btf(w[32], src[k + 2], -w[32], src[k + 3], cos_bit);
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 3: add/subtract butterflies at distance 2 inside each group of 4.
  ++stage;
  src = tmp;
  dst = output;
  for (int k = 0; k < n; k += 4) {
    dst[k] = src[k] + src[k + 2];
    dst[k + 1] = src[k + 1] + src[k + 3];
    dst[k + 2] = src[k] - src[k + 2];
    dst[k + 3] = src[k + 1] - src[k + 3];
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 4: in every group of eight, the first four pass through and the
  // last four are rotated with weight indices 16 and 48.
  ++stage;
  w = cospi_arr(cos_bit);
  src = output;
  dst = tmp;
  for (int k = 0; k < n; k += 8) {
    dst[k] = src[k];
    dst[k + 1] = src[k + 1];
    dst[k + 2] = src[k + 2];
    dst[k + 3] = src[k + 3];
    dst[k + 4] = half_btf(w[16], src[k + 4], w[48], src[k + 5], cos_bit);
    dst[k + 5] = half_btf(w[48], src[k + 4], -w[16], src[k + 5], cos_bit);
    dst[k + 6] = half_btf(-w[48], src[k + 6], w[16], src[k + 7], cos_bit);
    dst[k + 7] = half_btf(w[16], src[k + 6], w[48], src[k + 7], cos_bit);
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 5: add/subtract butterflies at distance 4 inside each group of 8.
  ++stage;
  src = tmp;
  dst = output;
  for (int k = 0; k < n; k += 8) {
    for (int i = 0; i < 4; ++i) {
      dst[k + i] = src[k + i] + src[k + i + 4];
    }
    for (int i = 0; i < 4; ++i) {
      dst[k + i + 4] = src[k + i] - src[k + i + 4];
    }
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 6: lower half passes through; upper half is rotated pairwise with
  // weight indices (8,56) and (40,24).
  ++stage;
  w = cospi_arr(cos_bit);
  src = output;
  dst = tmp;
  for (int k = 0; k < 8; ++k) {
    dst[k] = src[k];
  }
  dst[8] = half_btf(w[8], src[8], w[56], src[9], cos_bit);
  dst[9] = half_btf(w[56], src[8], -w[8], src[9], cos_bit);
  dst[10] = half_btf(w[40], src[10], w[24], src[11], cos_bit);
  dst[11] = half_btf(w[24], src[10], -w[40], src[11], cos_bit);
  dst[12] = half_btf(-w[56], src[12], w[8], src[13], cos_bit);
  dst[13] = half_btf(w[8], src[12], w[56], src[13], cos_bit);
  dst[14] = half_btf(-w[24], src[14], w[40], src[15], cos_bit);
  dst[15] = half_btf(w[40], src[14], w[24], src[15], cos_bit);
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 7: add/subtract butterflies at distance 8.
  ++stage;
  src = tmp;
  dst = output;
  for (int k = 0; k < 8; ++k) {
    dst[k] = src[k] + src[k + 8];
  }
  for (int k = 0; k < 8; ++k) {
    dst[k + 8] = src[k] - src[k + 8];
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 8: rotate each adjacent pair; pair j uses weight indices
  // (2 + 8j, 62 - 8j), i.e. (2,62), (10,54), ..., (58,6).
  ++stage;
  w = cospi_arr(cos_bit);
  src = output;
  dst = tmp;
  for (int j = 0; j < 8; ++j) {
    const int32_t wa = w[2 + 8 * j];
    const int32_t wb = w[62 - 8 * j];
    const int lo = 2 * j;
    const int hi = lo + 1;
    dst[lo] = half_btf(wa, src[lo], wb, src[hi], cos_bit);
    dst[hi] = half_btf(wb, src[lo], -wa, src[hi], cos_bit);
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);

  // stage 9: reorder into the final output positions.
  ++stage;
  src = tmp;
  dst = output;
  for (int k = 0; k < n; ++k) {
    dst[k] = src[kOutOrder[k]];
  }
  av1_range_check_buf(stage, input, dst, n, stage_range[stage]);
}
1063
1064
void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1065
475k
                      const int8_t *stage_range) {
1066
475k
  (void)cos_bit;
1067
2.37M
  for (int i = 0; i < 4; ++i)
1068
1.90M
    output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
1069
475k
  assert(stage_range[0] + NewSqrt2Bits <= 32);
1070
475k
  av1_range_check_buf(0, input, output, 4, stage_range[0]);
1071
475k
}
1072
1073
// 8-point forward identity transform (name-derived): every sample is
// doubled. cos_bit is unused; stage_range[0] bounds the result check.
void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  const int32_t txfm_size = 8;
  int idx = 0;

  (void)cos_bit;
  while (idx < txfm_size) {
    output[idx] = input[idx] * 2;
    ++idx;
  }
  av1_range_check_buf(0, input, output, txfm_size, stage_range[0]);
}
1079
1080
void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1081
1.92M
                       const int8_t *stage_range) {
1082
1.92M
  (void)cos_bit;
1083
32.7M
  for (int i = 0; i < 16; ++i)
1084
30.8M
    output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
1085
1.92M
  assert(stage_range[0] + NewSqrt2Bits <= 32);
1086
1.92M
  av1_range_check_buf(0, input, output, 16, stage_range[0]);
1087
1.92M
}
1088
1089
// 32-point forward identity transform (name-derived): every sample is
// multiplied by 4. cos_bit is unused; stage_range[0] bounds the result check.
void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
  const int32_t txfm_size = 32;
  int idx = 0;

  (void)cos_bit;
  while (idx < txfm_size) {
    output[idx] = input[idx] * 4;
    ++idx;
  }
  av1_range_check_buf(0, input, output, txfm_size, stage_range[0]);
}
1095
1096
void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
1097
32.3M
                const int8_t *stage_range) {
1098
32.3M
  const int32_t size = 64;
1099
32.3M
  const int32_t *cospi;
1100
1101
32.3M
  int32_t stage = 0;
1102
32.3M
  int32_t *bf0, *bf1;
1103
32.3M
  int32_t step[64];
1104
1105
  // stage 0;
1106
32.3M
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
1107
1108
  // stage 1;
1109
32.3M
  stage++;
1110
32.3M
  bf1 = output;
1111
32.3M
  bf1[0] = input[0] + input[63];
1112
32.3M
  bf1[1] = input[1] + input[62];
1113
32.3M
  bf1[2] = input[2] + input[61];
1114
32.3M
  bf1[3] = input[3] + input[60];
1115
32.3M
  bf1[4] = input[4] + input[59];
1116
32.3M
  bf1[5] = input[5] + input[58];
1117
32.3M
  bf1[6] = input[6] + input[57];
1118
32.3M
  bf1[7] = input[7] + input[56];
1119
32.3M
  bf1[8] = input[8] + input[55];
1120
32.3M
  bf1[9] = input[9] + input[54];
1121
32.3M
  bf1[10] = input[10] + input[53];
1122
32.3M
  bf1[11] = input[11] + input[52];
1123
32.3M
  bf1[12] = input[12] + input[51];
1124
32.3M
  bf1[13] = input[13] + input[50];
1125
32.3M
  bf1[14] = input[14] + input[49];
1126
32.3M
  bf1[15] = input[15] + input[48];
1127
32.3M
  bf1[16] = input[16] + input[47];
1128
32.3M
  bf1[17] = input[17] + input[46];
1129
32.3M
  bf1[18] = input[18] + input[45];
1130
32.3M
  bf1[19] = input[19] + input[44];
1131
32.3M
  bf1[20] = input[20] + input[43];
1132
32.3M
  bf1[21] = input[21] + input[42];
1133
32.3M
  bf1[22] = input[22] + input[41];
1134
32.3M
  bf1[23] = input[23] + input[40];
1135
32.3M
  bf1[24] = input[24] + input[39];
1136
32.3M
  bf1[25] = input[25] + input[38];
1137
32.3M
  bf1[26] = input[26] + input[37];
1138
32.3M
  bf1[27] = input[27] + input[36];
1139
32.3M
  bf1[28] = input[28] + input[35];
1140
32.3M
  bf1[29] = input[29] + input[34];
1141
32.3M
  bf1[30] = input[30] + input[33];
1142
32.3M
  bf1[31] = input[31] + input[32];
1143
32.3M
  bf1[32] = -input[32] + input[31];
1144
32.3M
  bf1[33] = -input[33] + input[30];
1145
32.3M
  bf1[34] = -input[34] + input[29];
1146
32.3M
  bf1[35] = -input[35] + input[28];
1147
32.3M
  bf1[36] = -input[36] + input[27];
1148
32.3M
  bf1[37] = -input[37] + input[26];
1149
32.3M
  bf1[38] = -input[38] + input[25];
1150
32.3M
  bf1[39] = -input[39] + input[24];
1151
32.3M
  bf1[40] = -input[40] + input[23];
1152
32.3M
  bf1[41] = -input[41] + input[22];
1153
32.3M
  bf1[42] = -input[42] + input[21];
1154
32.3M
  bf1[43] = -input[43] + input[20];
1155
32.3M
  bf1[44] = -input[44] + input[19];
1156
32.3M
  bf1[45] = -input[45] + input[18];
1157
32.3M
  bf1[46] = -input[46] + input[17];
1158
32.3M
  bf1[47] = -input[47] + input[16];
1159
32.3M
  bf1[48] = -input[48] + input[15];
1160
32.3M
  bf1[49] = -input[49] + input[14];
1161
32.3M
  bf1[50] = -input[50] + input[13];
1162
32.3M
  bf1[51] = -input[51] + input[12];
1163
32.3M
  bf1[52] = -input[52] + input[11];
1164
32.3M
  bf1[53] = -input[53] + input[10];
1165
32.3M
  bf1[54] = -input[54] + input[9];
1166
32.3M
  bf1[55] = -input[55] + input[8];
1167
32.3M
  bf1[56] = -input[56] + input[7];
1168
32.3M
  bf1[57] = -input[57] + input[6];
1169
32.3M
  bf1[58] = -input[58] + input[5];
1170
32.3M
  bf1[59] = -input[59] + input[4];
1171
32.3M
  bf1[60] = -input[60] + input[3];
1172
32.3M
  bf1[61] = -input[61] + input[2];
1173
32.3M
  bf1[62] = -input[62] + input[1];
1174
32.3M
  bf1[63] = -input[63] + input[0];
1175
32.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1176
1177
  // stage 2
1178
32.3M
  stage++;
1179
32.3M
  cospi = cospi_arr(cos_bit);
1180
32.3M
  bf0 = output;
1181
32.3M
  bf1 = step;
1182
32.3M
  bf1[0] = bf0[0] + bf0[31];
1183
32.3M
  bf1[1] = bf0[1] + bf0[30];
1184
32.3M
  bf1[2] = bf0[2] + bf0[29];
1185
32.3M
  bf1[3] = bf0[3] + bf0[28];
1186
32.3M
  bf1[4] = bf0[4] + bf0[27];
1187
32.3M
  bf1[5] = bf0[5] + bf0[26];
1188
32.3M
  bf1[6] = bf0[6] + bf0[25];
1189
32.3M
  bf1[7] = bf0[7] + bf0[24];
1190
32.3M
  bf1[8] = bf0[8] + bf0[23];
1191
32.3M
  bf1[9] = bf0[9] + bf0[22];
1192
32.3M
  bf1[10] = bf0[10] + bf0[21];
1193
32.3M
  bf1[11] = bf0[11] + bf0[20];
1194
32.3M
  bf1[12] = bf0[12] + bf0[19];
1195
32.3M
  bf1[13] = bf0[13] + bf0[18];
1196
32.3M
  bf1[14] = bf0[14] + bf0[17];
1197
32.3M
  bf1[15] = bf0[15] + bf0[16];
1198
32.3M
  bf1[16] = -bf0[16] + bf0[15];
1199
32.3M
  bf1[17] = -bf0[17] + bf0[14];
1200
32.3M
  bf1[18] = -bf0[18] + bf0[13];
1201
32.3M
  bf1[19] = -bf0[19] + bf0[12];
1202
32.3M
  bf1[20] = -bf0[20] + bf0[11];
1203
32.3M
  bf1[21] = -bf0[21] + bf0[10];
1204
32.3M
  bf1[22] = -bf0[22] + bf0[9];
1205
32.3M
  bf1[23] = -bf0[23] + bf0[8];
1206
32.3M
  bf1[24] = -bf0[24] + bf0[7];
1207
32.3M
  bf1[25] = -bf0[25] + bf0[6];
1208
32.3M
  bf1[26] = -bf0[26] + bf0[5];
1209
32.3M
  bf1[27] = -bf0[27] + bf0[4];
1210
32.3M
  bf1[28] = -bf0[28] + bf0[3];
1211
32.3M
  bf1[29] = -bf0[29] + bf0[2];
1212
32.3M
  bf1[30] = -bf0[30] + bf0[1];
1213
32.3M
  bf1[31] = -bf0[31] + bf0[0];
1214
32.3M
  bf1[32] = bf0[32];
1215
32.3M
  bf1[33] = bf0[33];
1216
32.3M
  bf1[34] = bf0[34];
1217
32.3M
  bf1[35] = bf0[35];
1218
32.3M
  bf1[36] = bf0[36];
1219
32.3M
  bf1[37] = bf0[37];
1220
32.3M
  bf1[38] = bf0[38];
1221
32.3M
  bf1[39] = bf0[39];
1222
32.3M
  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1223
32.3M
  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1224
32.3M
  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1225
32.3M
  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1226
32.3M
  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1227
32.3M
  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1228
32.3M
  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1229
32.3M
  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1230
32.3M
  bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
1231
32.3M
  bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
1232
32.3M
  bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
1233
32.3M
  bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
1234
32.3M
  bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
1235
32.3M
  bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
1236
32.3M
  bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
1237
32.3M
  bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
1238
32.3M
  bf1[56] = bf0[56];
1239
32.3M
  bf1[57] = bf0[57];
1240
32.3M
  bf1[58] = bf0[58];
1241
32.3M
  bf1[59] = bf0[59];
1242
32.3M
  bf1[60] = bf0[60];
1243
32.3M
  bf1[61] = bf0[61];
1244
32.3M
  bf1[62] = bf0[62];
1245
32.3M
  bf1[63] = bf0[63];
1246
32.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1247
1248
  // stage 3
1249
32.3M
  stage++;
1250
32.3M
  cospi = cospi_arr(cos_bit);
1251
32.3M
  bf0 = step;
1252
32.3M
  bf1 = output;
1253
32.3M
  bf1[0] = bf0[0] + bf0[15];
1254
32.3M
  bf1[1] = bf0[1] + bf0[14];
1255
32.3M
  bf1[2] = bf0[2] + bf0[13];
1256
32.3M
  bf1[3] = bf0[3] + bf0[12];
1257
32.3M
  bf1[4] = bf0[4] + bf0[11];
1258
32.3M
  bf1[5] = bf0[5] + bf0[10];
1259
32.3M
  bf1[6] = bf0[6] + bf0[9];
1260
32.3M
  bf1[7] = bf0[7] + bf0[8];
1261
32.3M
  bf1[8] = -bf0[8] + bf0[7];
1262
32.3M
  bf1[9] = -bf0[9] + bf0[6];
1263
32.3M
  bf1[10] = -bf0[10] + bf0[5];
1264
32.3M
  bf1[11] = -bf0[11] + bf0[4];
1265
32.3M
  bf1[12] = -bf0[12] + bf0[3];
1266
32.3M
  bf1[13] = -bf0[13] + bf0[2];
1267
32.3M
  bf1[14] = -bf0[14] + bf0[1];
1268
32.3M
  bf1[15] = -bf0[15] + bf0[0];
1269
32.3M
  bf1[16] = bf0[16];
1270
32.3M
  bf1[17] = bf0[17];
1271
32.3M
  bf1[18] = bf0[18];
1272
32.3M
  bf1[19] = bf0[19];
1273
32.3M
  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1274
32.3M
  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1275
32.3M
  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1276
32.3M
  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1277
32.3M
  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
1278
32.3M
  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
1279
32.3M
  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
1280
32.3M
  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
1281
32.3M
  bf1[28] = bf0[28];
1282
32.3M
  bf1[29] = bf0[29];
1283
32.3M
  bf1[30] = bf0[30];
1284
32.3M
  bf1[31] = bf0[31];
1285
32.3M
  bf1[32] = bf0[32] + bf0[47];
1286
32.3M
  bf1[33] = bf0[33] + bf0[46];
1287
32.3M
  bf1[34] = bf0[34] + bf0[45];
1288
32.3M
  bf1[35] = bf0[35] + bf0[44];
1289
32.3M
  bf1[36] = bf0[36] + bf0[43];
1290
32.3M
  bf1[37] = bf0[37] + bf0[42];
1291
32.3M
  bf1[38] = bf0[38] + bf0[41];
1292
32.3M
  bf1[39] = bf0[39] + bf0[40];
1293
32.3M
  bf1[40] = -bf0[40] + bf0[39];
1294
32.3M
  bf1[41] = -bf0[41] + bf0[38];
1295
32.3M
  bf1[42] = -bf0[42] + bf0[37];
1296
32.3M
  bf1[43] = -bf0[43] + bf0[36];
1297
32.3M
  bf1[44] = -bf0[44] + bf0[35];
1298
32.3M
  bf1[45] = -bf0[45] + bf0[34];
1299
32.3M
  bf1[46] = -bf0[46] + bf0[33];
1300
32.3M
  bf1[47] = -bf0[47] + bf0[32];
1301
32.3M
  bf1[48] = -bf0[48] + bf0[63];
1302
32.3M
  bf1[49] = -bf0[49] + bf0[62];
1303
32.3M
  bf1[50] = -bf0[50] + bf0[61];
1304
32.3M
  bf1[51] = -bf0[51] + bf0[60];
1305
32.3M
  bf1[52] = -bf0[52] + bf0[59];
1306
32.3M
  bf1[53] = -bf0[53] + bf0[58];
1307
32.3M
  bf1[54] = -bf0[54] + bf0[57];
1308
32.3M
  bf1[55] = -bf0[55] + bf0[56];
1309
32.3M
  bf1[56] = bf0[56] + bf0[55];
1310
32.3M
  bf1[57] = bf0[57] + bf0[54];
1311
32.3M
  bf1[58] = bf0[58] + bf0[53];
1312
32.3M
  bf1[59] = bf0[59] + bf0[52];
1313
32.3M
  bf1[60] = bf0[60] + bf0[51];
1314
32.3M
  bf1[61] = bf0[61] + bf0[50];
1315
32.3M
  bf1[62] = bf0[62] + bf0[49];
1316
32.3M
  bf1[63] = bf0[63] + bf0[48];
1317
32.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1318
1319
  // stage 4
1320
32.3M
  stage++;
1321
32.3M
  cospi = cospi_arr(cos_bit);
1322
32.3M
  bf0 = output;
1323
32.3M
  bf1 = step;
1324
32.3M
  bf1[0] = bf0[0] + bf0[7];
1325
32.3M
  bf1[1] = bf0[1] + bf0[6];
1326
32.3M
  bf1[2] = bf0[2] + bf0[5];
1327
32.3M
  bf1[3] = bf0[3] + bf0[4];
1328
32.3M
  bf1[4] = -bf0[4] + bf0[3];
1329
32.3M
  bf1[5] = -bf0[5] + bf0[2];
1330
32.3M
  bf1[6] = -bf0[6] + bf0[1];
1331
32.3M
  bf1[7] = -bf0[7] + bf0[0];
1332
32.3M
  bf1[8] = bf0[8];
1333
32.3M
  bf1[9] = bf0[9];
1334
32.3M
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1335
32.3M
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1336
32.3M
  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
1337
32.3M
  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
1338
32.3M
  bf1[14] = bf0[14];
1339
32.3M
  bf1[15] = bf0[15];
1340
32.3M
  bf1[16] = bf0[16] + bf0[23];
1341
32.3M
  bf1[17] = bf0[17] + bf0[22];
1342
32.3M
  bf1[18] = bf0[18] + bf0[21];
1343
32.3M
  bf1[19] = bf0[19] + bf0[20];
1344
32.3M
  bf1[20] = -bf0[20] + bf0[19];
1345
32.3M
  bf1[21] = -bf0[21] + bf0[18];
1346
32.3M
  bf1[22] = -bf0[22] + bf0[17];
1347
32.3M
  bf1[23] = -bf0[23] + bf0[16];
1348
32.3M
  bf1[24] = -bf0[24] + bf0[31];
1349
32.3M
  bf1[25] = -bf0[25] + bf0[30];
1350
32.3M
  bf1[26] = -bf0[26] + bf0[29];
1351
32.3M
  bf1[27] = -bf0[27] + bf0[28];
1352
32.3M
  bf1[28] = bf0[28] + bf0[27];
1353
32.3M
  bf1[29] = bf0[29] + bf0[26];
1354
32.3M
  bf1[30] = bf0[30] + bf0[25];
1355
32.3M
  bf1[31] = bf0[31] + bf0[24];
1356
32.3M
  bf1[32] = bf0[32];
1357
32.3M
  bf1[33] = bf0[33];
1358
32.3M
  bf1[34] = bf0[34];
1359
32.3M
  bf1[35] = bf0[35];
1360
32.3M
  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
1361
32.3M
  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
1362
32.3M
  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
1363
32.3M
  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
1364
32.3M
  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
1365
32.3M
  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
1366
32.3M
  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
1367
32.3M
  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
1368
32.3M
  bf1[44] = bf0[44];
1369
32.3M
  bf1[45] = bf0[45];
1370
32.3M
  bf1[46] = bf0[46];
1371
32.3M
  bf1[47] = bf0[47];
1372
32.3M
  bf1[48] = bf0[48];
1373
32.3M
  bf1[49] = bf0[49];
1374
32.3M
  bf1[50] = bf0[50];
1375
32.3M
  bf1[51] = bf0[51];
1376
32.3M
  bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
1377
32.3M
  bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
1378
32.3M
  bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
1379
32.3M
  bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
1380
32.3M
  bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
1381
32.3M
  bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
1382
32.3M
  bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
1383
32.3M
  bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
1384
32.3M
  bf1[60] = bf0[60];
1385
32.3M
  bf1[61] = bf0[61];
1386
32.3M
  bf1[62] = bf0[62];
1387
32.3M
  bf1[63] = bf0[63];
1388
32.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1389
1390
  // stage 5
1391
32.3M
  stage++;
1392
32.3M
  cospi = cospi_arr(cos_bit);
1393
32.3M
  bf0 = step;
1394
32.3M
  bf1 = output;
1395
32.3M
  bf1[0] = bf0[0] + bf0[3];
1396
32.3M
  bf1[1] = bf0[1] + bf0[2];
1397
32.3M
  bf1[2] = -bf0[2] + bf0[1];
1398
32.3M
  bf1[3] = -bf0[3] + bf0[0];
1399
32.3M
  bf1[4] = bf0[4];
1400
32.3M
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1401
32.3M
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
1402
32.3M
  bf1[7] = bf0[7];
1403
32.3M
  bf1[8] = bf0[8] + bf0[11];
1404
32.3M
  bf1[9] = bf0[9] + bf0[10];
1405
32.3M
  bf1[10] = -bf0[10] + bf0[9];
1406
32.3M
  bf1[11] = -bf0[11] + bf0[8];
1407
32.3M
  bf1[12] = -bf0[12] + bf0[15];
1408
32.3M
  bf1[13] = -bf0[13] + bf0[14];
1409
32.3M
  bf1[14] = bf0[14] + bf0[13];
1410
32.3M
  bf1[15] = bf0[15] + bf0[12];
1411
32.3M
  bf1[16] = bf0[16];
1412
32.3M
  bf1[17] = bf0[17];
1413
32.3M
  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
1414
32.3M
  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
1415
32.3M
  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
1416
32.3M
  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
1417
32.3M
  bf1[22] = bf0[22];
1418
32.3M
  bf1[23] = bf0[23];
1419
32.3M
  bf1[24] = bf0[24];
1420
32.3M
  bf1[25] = bf0[25];
1421
32.3M
  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
1422
32.3M
  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
1423
32.3M
  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
1424
32.3M
  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
1425
32.3M
  bf1[30] = bf0[30];
1426
32.3M
  bf1[31] = bf0[31];
1427
32.3M
  bf1[32] = bf0[32] + bf0[39];
1428
32.3M
  bf1[33] = bf0[33] + bf0[38];
1429
32.3M
  bf1[34] = bf0[34] + bf0[37];
1430
32.3M
  bf1[35] = bf0[35] + bf0[36];
1431
32.3M
  bf1[36] = -bf0[36] + bf0[35];
1432
32.3M
  bf1[37] = -bf0[37] + bf0[34];
1433
32.3M
  bf1[38] = -bf0[38] + bf0[33];
1434
32.3M
  bf1[39] = -bf0[39] + bf0[32];
1435
32.3M
  bf1[40] = -bf0[40] + bf0[47];
1436
32.3M
  bf1[41] = -bf0[41] + bf0[46];
1437
32.3M
  bf1[42] = -bf0[42] + bf0[45];
1438
32.3M
  bf1[43] = -bf0[43] + bf0[44];
1439
32.3M
  bf1[44] = bf0[44] + bf0[43];
1440
32.3M
  bf1[45] = bf0[45] + bf0[42];
1441
32.3M
  bf1[46] = bf0[46] + bf0[41];
1442
32.3M
  bf1[47] = bf0[47] + bf0[40];
1443
32.3M
  bf1[48] = bf0[48] + bf0[55];
1444
32.3M
  bf1[49] = bf0[49] + bf0[54];
1445
32.3M
  bf1[50] = bf0[50] + bf0[53];
1446
32.3M
  bf1[51] = bf0[51] + bf0[52];
1447
32.3M
  bf1[52] = -bf0[52] + bf0[51];
1448
32.3M
  bf1[53] = -bf0[53] + bf0[50];
1449
32.3M
  bf1[54] = -bf0[54] + bf0[49];
1450
32.3M
  bf1[55] = -bf0[55] + bf0[48];
1451
32.3M
  bf1[56] = -bf0[56] + bf0[63];
1452
32.3M
  bf1[57] = -bf0[57] + bf0[62];
1453
32.3M
  bf1[58] = -bf0[58] + bf0[61];
1454
32.3M
  bf1[59] = -bf0[59] + bf0[60];
1455
32.3M
  bf1[60] = bf0[60] + bf0[59];
1456
32.3M
  bf1[61] = bf0[61] + bf0[58];
1457
32.3M
  bf1[62] = bf0[62] + bf0[57];
1458
32.3M
  bf1[63] = bf0[63] + bf0[56];
1459
32.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1460
1461
  // stage 6
1462
32.3M
  stage++;
1463
32.3M
  cospi = cospi_arr(cos_bit);
1464
32.3M
  bf0 = output;
1465
32.3M
  bf1 = step;
1466
32.3M
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
1467
32.3M
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
1468
32.3M
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
1469
32.3M
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
1470
32.3M
  bf1[4] = bf0[4] + bf0[5];
1471
32.3M
  bf1[5] = -bf0[5] + bf0[4];
1472
32.3M
  bf1[6] = -bf0[6] + bf0[7];
1473
32.3M
  bf1[7] = bf0[7] + bf0[6];
1474
32.3M
  bf1[8] = bf0[8];
1475
32.3M
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
1476
32.3M
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
1477
32.3M
  bf1[11] = bf0[11];
1478
32.3M
  bf1[12] = bf0[12];
1479
32.3M
  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
1480
32.3M
  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
1481
32.3M
  bf1[15] = bf0[15];
1482
32.3M
  bf1[16] = bf0[16] + bf0[19];
1483
32.3M
  bf1[17] = bf0[17] + bf0[18];
1484
32.3M
  bf1[18] = -bf0[18] + bf0[17];
1485
32.3M
  bf1[19] = -bf0[19] + bf0[16];
1486
32.3M
  bf1[20] = -bf0[20] + bf0[23];
1487
32.3M
  bf1[21] = -bf0[21] + bf0[22];
1488
32.3M
  bf1[22] = bf0[22] + bf0[21];
1489
32.3M
  bf1[23] = bf0[23] + bf0[20];
1490
32.3M
  bf1[24] = bf0[24] + bf0[27];
1491
32.3M
  bf1[25] = bf0[25] + bf0[26];
1492
32.3M
  bf1[26] = -bf0[26] + bf0[25];
1493
32.3M
  bf1[27] = -bf0[27] + bf0[24];
1494
32.3M
  bf1[28] = -bf0[28] + bf0[31];
1495
32.3M
  bf1[29] = -bf0[29] + bf0[30];
1496
32.3M
  bf1[30] = bf0[30] + bf0[29];
1497
32.3M
  bf1[31] = bf0[31] + bf0[28];
1498
32.3M
  bf1[32] = bf0[32];
1499
32.3M
  bf1[33] = bf0[33];
1500
32.3M
  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
1501
32.3M
  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
1502
32.3M
  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
1503
32.3M
  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
1504
32.3M
  bf1[38] = bf0[38];
1505
32.3M
  bf1[39] = bf0[39];
1506
32.3M
  bf1[40] = bf0[40];
1507
32.3M
  bf1[41] = bf0[41];
1508
32.3M
  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
1509
32.3M
  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
1510
32.3M
  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
1511
32.3M
  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
1512
32.3M
  bf1[46] = bf0[46];
1513
32.3M
  bf1[47] = bf0[47];
1514
32.3M
  bf1[48] = bf0[48];
1515
32.3M
  bf1[49] = bf0[49];
1516
32.3M
  bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
1517
32.3M
  bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
1518
32.3M
  bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
1519
32.3M
  bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
1520
32.3M
  bf1[54] = bf0[54];
1521
32.3M
  bf1[55] = bf0[55];
1522
32.3M
  bf1[56] = bf0[56];
1523
32.3M
  bf1[57] = bf0[57];
1524
32.3M
  bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
1525
32.3M
  bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
1526
32.3M
  bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
1527
32.3M
  bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
1528
32.3M
  bf1[62] = bf0[62];
1529
32.3M
  bf1[63] = bf0[63];
1530
32.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1531
1532
  // stage 7
1533
32.3M
  stage++;
1534
32.3M
  cospi = cospi_arr(cos_bit);
1535
32.3M
  bf0 = step;
1536
32.3M
  bf1 = output;
1537
32.3M
  bf1[0] = bf0[0];
1538
32.3M
  bf1[1] = bf0[1];
1539
32.3M
  bf1[2] = bf0[2];
1540
32.3M
  bf1[3] = bf0[3];
1541
32.3M
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
1542
32.3M
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
1543
32.3M
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
1544
32.3M
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
1545
32.3M
  bf1[8] = bf0[8] + bf0[9];
1546
32.3M
  bf1[9] = -bf0[9] + bf0[8];
1547
32.3M
  bf1[10] = -bf0[10] + bf0[11];
1548
32.3M
  bf1[11] = bf0[11] + bf0[10];
1549
32.3M
  bf1[12] = bf0[12] + bf0[13];
1550
32.3M
  bf1[13] = -bf0[13] + bf0[12];
1551
32.3M
  bf1[14] = -bf0[14] + bf0[15];
1552
32.3M
  bf1[15] = bf0[15] + bf0[14];
1553
32.3M
  bf1[16] = bf0[16];
1554
32.3M
  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
1555
32.3M
  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
1556
32.3M
  bf1[19] = bf0[19];
1557
32.3M
  bf1[20] = bf0[20];
1558
32.3M
  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
1559
32.3M
  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
1560
32.3M
  bf1[23] = bf0[23];
1561
32.3M
  bf1[24] = bf0[24];
1562
32.3M
  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
1563
32.3M
  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
1564
32.3M
  bf1[27] = bf0[27];
1565
32.3M
  bf1[28] = bf0[28];
1566
32.3M
  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
1567
32.3M
  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
1568
32.3M
  bf1[31] = bf0[31];
1569
32.3M
  bf1[32] = bf0[32] + bf0[35];
1570
32.3M
  bf1[33] = bf0[33] + bf0[34];
1571
32.3M
  bf1[34] = -bf0[34] + bf0[33];
1572
32.3M
  bf1[35] = -bf0[35] + bf0[32];
1573
32.3M
  bf1[36] = -bf0[36] + bf0[39];
1574
32.3M
  bf1[37] = -bf0[37] + bf0[38];
1575
32.3M
  bf1[38] = bf0[38] + bf0[37];
1576
32.3M
  bf1[39] = bf0[39] + bf0[36];
1577
32.3M
  bf1[40] = bf0[40] + bf0[43];
1578
32.3M
  bf1[41] = bf0[41] + bf0[42];
1579
32.3M
  bf1[42] = -bf0[42] + bf0[41];
1580
32.3M
  bf1[43] = -bf0[43] + bf0[40];
1581
32.3M
  bf1[44] = -bf0[44] + bf0[47];
1582
32.3M
  bf1[45] = -bf0[45] + bf0[46];
1583
32.3M
  bf1[46] = bf0[46] + bf0[45];
1584
32.3M
  bf1[47] = bf0[47] + bf0[44];
1585
32.3M
  bf1[48] = bf0[48] + bf0[51];
1586
32.3M
  bf1[49] = bf0[49] + bf0[50];
1587
32.3M
  bf1[50] = -bf0[50] + bf0[49];
1588
32.3M
  bf1[51] = -bf0[51] + bf0[48];
1589
32.3M
  bf1[52] = -bf0[52] + bf0[55];
1590
32.3M
  bf1[53] = -bf0[53] + bf0[54];
1591
32.3M
  bf1[54] = bf0[54] + bf0[53];
1592
32.3M
  bf1[55] = bf0[55] + bf0[52];
1593
32.3M
  bf1[56] = bf0[56] + bf0[59];
1594
32.3M
  bf1[57] = bf0[57] + bf0[58];
1595
32.3M
  bf1[58] = -bf0[58] + bf0[57];
1596
32.3M
  bf1[59] = -bf0[59] + bf0[56];
1597
32.3M
  bf1[60] = -bf0[60] + bf0[63];
1598
32.3M
  bf1[61] = -bf0[61] + bf0[62];
1599
32.3M
  bf1[62] = bf0[62] + bf0[61];
1600
32.3M
  bf1[63] = bf0[63] + bf0[60];
1601
32.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1602
1603
  // stage 8
1604
32.3M
  stage++;
1605
32.3M
  cospi = cospi_arr(cos_bit);
1606
32.3M
  bf0 = output;
1607
32.3M
  bf1 = step;
1608
32.3M
  bf1[0] = bf0[0];
1609
32.3M
  bf1[1] = bf0[1];
1610
32.3M
  bf1[2] = bf0[2];
1611
32.3M
  bf1[3] = bf0[3];
1612
32.3M
  bf1[4] = bf0[4];
1613
32.3M
  bf1[5] = bf0[5];
1614
32.3M
  bf1[6] = bf0[6];
1615
32.3M
  bf1[7] = bf0[7];
1616
32.3M
  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
1617
32.3M
  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
1618
32.3M
  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
1619
32.3M
  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
1620
32.3M
  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
1621
32.3M
  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
1622
32.3M
  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
1623
32.3M
  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
1624
32.3M
  bf1[16] = bf0[16] + bf0[17];
1625
32.3M
  bf1[17] = -bf0[17] + bf0[16];
1626
32.3M
  bf1[18] = -bf0[18] + bf0[19];
1627
32.3M
  bf1[19] = bf0[19] + bf0[18];
1628
32.3M
  bf1[20] = bf0[20] + bf0[21];
1629
32.3M
  bf1[21] = -bf0[21] + bf0[20];
1630
32.3M
  bf1[22] = -bf0[22] + bf0[23];
1631
32.3M
  bf1[23] = bf0[23] + bf0[22];
1632
32.3M
  bf1[24] = bf0[24] + bf0[25];
1633
32.3M
  bf1[25] = -bf0[25] + bf0[24];
1634
32.3M
  bf1[26] = -bf0[26] + bf0[27];
1635
32.3M
  bf1[27] = bf0[27] + bf0[26];
1636
32.3M
  bf1[28] = bf0[28] + bf0[29];
1637
32.3M
  bf1[29] = -bf0[29] + bf0[28];
1638
32.3M
  bf1[30] = -bf0[30] + bf0[31];
1639
32.3M
  bf1[31] = bf0[31] + bf0[30];
1640
32.3M
  bf1[32] = bf0[32];
1641
32.3M
  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
1642
32.3M
  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
1643
32.3M
  bf1[35] = bf0[35];
1644
32.3M
  bf1[36] = bf0[36];
1645
32.3M
  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
1646
32.3M
  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
1647
32.3M
  bf1[39] = bf0[39];
1648
32.3M
  bf1[40] = bf0[40];
1649
32.3M
  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
1650
32.3M
  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
1651
32.3M
  bf1[43] = bf0[43];
1652
32.3M
  bf1[44] = bf0[44];
1653
32.3M
  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
1654
32.3M
  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
1655
32.3M
  bf1[47] = bf0[47];
1656
32.3M
  bf1[48] = bf0[48];
1657
32.3M
  bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
1658
32.3M
  bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
1659
32.3M
  bf1[51] = bf0[51];
1660
32.3M
  bf1[52] = bf0[52];
1661
32.3M
  bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
1662
32.3M
  bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
1663
32.3M
  bf1[55] = bf0[55];
1664
32.3M
  bf1[56] = bf0[56];
1665
32.3M
  bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
1666
32.3M
  bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
1667
32.3M
  bf1[59] = bf0[59];
1668
32.3M
  bf1[60] = bf0[60];
1669
32.3M
  bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
1670
32.3M
  bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
1671
32.3M
  bf1[63] = bf0[63];
1672
32.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1673
1674
  // stage 9
1675
32.3M
  stage++;
1676
32.3M
  cospi = cospi_arr(cos_bit);
1677
32.3M
  bf0 = step;
1678
32.3M
  bf1 = output;
1679
32.3M
  bf1[0] = bf0[0];
1680
32.3M
  bf1[1] = bf0[1];
1681
32.3M
  bf1[2] = bf0[2];
1682
32.3M
  bf1[3] = bf0[3];
1683
32.3M
  bf1[4] = bf0[4];
1684
32.3M
  bf1[5] = bf0[5];
1685
32.3M
  bf1[6] = bf0[6];
1686
32.3M
  bf1[7] = bf0[7];
1687
32.3M
  bf1[8] = bf0[8];
1688
32.3M
  bf1[9] = bf0[9];
1689
32.3M
  bf1[10] = bf0[10];
1690
32.3M
  bf1[11] = bf0[11];
1691
32.3M
  bf1[12] = bf0[12];
1692
32.3M
  bf1[13] = bf0[13];
1693
32.3M
  bf1[14] = bf0[14];
1694
32.3M
  bf1[15] = bf0[15];
1695
32.3M
  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
1696
32.3M
  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
1697
32.3M
  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
1698
32.3M
  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
1699
32.3M
  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
1700
32.3M
  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
1701
32.3M
  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
1702
32.3M
  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
1703
32.3M
  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
1704
32.3M
  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
1705
32.3M
  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
1706
32.3M
  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
1707
32.3M
  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
1708
32.3M
  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
1709
32.3M
  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
1710
32.3M
  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
1711
32.3M
  bf1[32] = bf0[32] + bf0[33];
1712
32.3M
  bf1[33] = -bf0[33] + bf0[32];
1713
32.3M
  bf1[34] = -bf0[34] + bf0[35];
1714
32.3M
  bf1[35] = bf0[35] + bf0[34];
1715
32.3M
  bf1[36] = bf0[36] + bf0[37];
1716
32.3M
  bf1[37] = -bf0[37] + bf0[36];
1717
32.3M
  bf1[38] = -bf0[38] + bf0[39];
1718
32.3M
  bf1[39] = bf0[39] + bf0[38];
1719
32.3M
  bf1[40] = bf0[40] + bf0[41];
1720
32.3M
  bf1[41] = -bf0[41] + bf0[40];
1721
32.3M
  bf1[42] = -bf0[42] + bf0[43];
1722
32.3M
  bf1[43] = bf0[43] + bf0[42];
1723
32.3M
  bf1[44] = bf0[44] + bf0[45];
1724
32.3M
  bf1[45] = -bf0[45] + bf0[44];
1725
32.3M
  bf1[46] = -bf0[46] + bf0[47];
1726
32.3M
  bf1[47] = bf0[47] + bf0[46];
1727
32.3M
  bf1[48] = bf0[48] + bf0[49];
1728
32.3M
  bf1[49] = -bf0[49] + bf0[48];
1729
32.3M
  bf1[50] = -bf0[50] + bf0[51];
1730
32.3M
  bf1[51] = bf0[51] + bf0[50];
1731
32.3M
  bf1[52] = bf0[52] + bf0[53];
1732
32.3M
  bf1[53] = -bf0[53] + bf0[52];
1733
32.3M
  bf1[54] = -bf0[54] + bf0[55];
1734
32.3M
  bf1[55] = bf0[55] + bf0[54];
1735
32.3M
  bf1[56] = bf0[56] + bf0[57];
1736
32.3M
  bf1[57] = -bf0[57] + bf0[56];
1737
32.3M
  bf1[58] = -bf0[58] + bf0[59];
1738
32.3M
  bf1[59] = bf0[59] + bf0[58];
1739
32.3M
  bf1[60] = bf0[60] + bf0[61];
1740
32.3M
  bf1[61] = -bf0[61] + bf0[60];
1741
32.3M
  bf1[62] = -bf0[62] + bf0[63];
1742
32.3M
  bf1[63] = bf0[63] + bf0[62];
1743
32.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1744
1745
  // stage 10
1746
32.3M
  stage++;
1747
32.3M
  cospi = cospi_arr(cos_bit);
1748
32.3M
  bf0 = output;
1749
32.3M
  bf1 = step;
1750
32.3M
  bf1[0] = bf0[0];
1751
32.3M
  bf1[1] = bf0[1];
1752
32.3M
  bf1[2] = bf0[2];
1753
32.3M
  bf1[3] = bf0[3];
1754
32.3M
  bf1[4] = bf0[4];
1755
32.3M
  bf1[5] = bf0[5];
1756
32.3M
  bf1[6] = bf0[6];
1757
32.3M
  bf1[7] = bf0[7];
1758
32.3M
  bf1[8] = bf0[8];
1759
32.3M
  bf1[9] = bf0[9];
1760
32.3M
  bf1[10] = bf0[10];
1761
32.3M
  bf1[11] = bf0[11];
1762
32.3M
  bf1[12] = bf0[12];
1763
32.3M
  bf1[13] = bf0[13];
1764
32.3M
  bf1[14] = bf0[14];
1765
32.3M
  bf1[15] = bf0[15];
1766
32.3M
  bf1[16] = bf0[16];
1767
32.3M
  bf1[17] = bf0[17];
1768
32.3M
  bf1[18] = bf0[18];
1769
32.3M
  bf1[19] = bf0[19];
1770
32.3M
  bf1[20] = bf0[20];
1771
32.3M
  bf1[21] = bf0[21];
1772
32.3M
  bf1[22] = bf0[22];
1773
32.3M
  bf1[23] = bf0[23];
1774
32.3M
  bf1[24] = bf0[24];
1775
32.3M
  bf1[25] = bf0[25];
1776
32.3M
  bf1[26] = bf0[26];
1777
32.3M
  bf1[27] = bf0[27];
1778
32.3M
  bf1[28] = bf0[28];
1779
32.3M
  bf1[29] = bf0[29];
1780
32.3M
  bf1[30] = bf0[30];
1781
32.3M
  bf1[31] = bf0[31];
1782
32.3M
  bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
1783
32.3M
  bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
1784
32.3M
  bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
1785
32.3M
  bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
1786
32.3M
  bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
1787
32.3M
  bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
1788
32.3M
  bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
1789
32.3M
  bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
1790
32.3M
  bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
1791
32.3M
  bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
1792
32.3M
  bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
1793
32.3M
  bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
1794
32.3M
  bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
1795
32.3M
  bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
1796
32.3M
  bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
1797
32.3M
  bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
1798
32.3M
  bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
1799
32.3M
  bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
1800
32.3M
  bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
1801
32.3M
  bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
1802
32.3M
  bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
1803
32.3M
  bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
1804
32.3M
  bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
1805
32.3M
  bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
1806
32.3M
  bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
1807
32.3M
  bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
1808
32.3M
  bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
1809
32.3M
  bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
1810
32.3M
  bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
1811
32.3M
  bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
1812
32.3M
  bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
1813
32.3M
  bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
1814
32.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1815
1816
  // stage 11
1817
32.3M
  stage++;
1818
32.3M
  bf0 = step;
1819
32.3M
  bf1 = output;
1820
32.3M
  bf1[0] = bf0[0];
1821
32.3M
  bf1[1] = bf0[32];
1822
32.3M
  bf1[2] = bf0[16];
1823
32.3M
  bf1[3] = bf0[48];
1824
32.3M
  bf1[4] = bf0[8];
1825
32.3M
  bf1[5] = bf0[40];
1826
32.3M
  bf1[6] = bf0[24];
1827
32.3M
  bf1[7] = bf0[56];
1828
32.3M
  bf1[8] = bf0[4];
1829
32.3M
  bf1[9] = bf0[36];
1830
32.3M
  bf1[10] = bf0[20];
1831
32.3M
  bf1[11] = bf0[52];
1832
32.3M
  bf1[12] = bf0[12];
1833
32.3M
  bf1[13] = bf0[44];
1834
32.3M
  bf1[14] = bf0[28];
1835
32.3M
  bf1[15] = bf0[60];
1836
32.3M
  bf1[16] = bf0[2];
1837
32.3M
  bf1[17] = bf0[34];
1838
32.3M
  bf1[18] = bf0[18];
1839
32.3M
  bf1[19] = bf0[50];
1840
32.3M
  bf1[20] = bf0[10];
1841
32.3M
  bf1[21] = bf0[42];
1842
32.3M
  bf1[22] = bf0[26];
1843
32.3M
  bf1[23] = bf0[58];
1844
32.3M
  bf1[24] = bf0[6];
1845
32.3M
  bf1[25] = bf0[38];
1846
32.3M
  bf1[26] = bf0[22];
1847
32.3M
  bf1[27] = bf0[54];
1848
32.3M
  bf1[28] = bf0[14];
1849
32.3M
  bf1[29] = bf0[46];
1850
32.3M
  bf1[30] = bf0[30];
1851
32.3M
  bf1[31] = bf0[62];
1852
32.3M
  bf1[32] = bf0[1];
1853
32.3M
  bf1[33] = bf0[33];
1854
32.3M
  bf1[34] = bf0[17];
1855
32.3M
  bf1[35] = bf0[49];
1856
32.3M
  bf1[36] = bf0[9];
1857
32.3M
  bf1[37] = bf0[41];
1858
32.3M
  bf1[38] = bf0[25];
1859
32.3M
  bf1[39] = bf0[57];
1860
32.3M
  bf1[40] = bf0[5];
1861
32.3M
  bf1[41] = bf0[37];
1862
32.3M
  bf1[42] = bf0[21];
1863
32.3M
  bf1[43] = bf0[53];
1864
32.3M
  bf1[44] = bf0[13];
1865
32.3M
  bf1[45] = bf0[45];
1866
32.3M
  bf1[46] = bf0[29];
1867
32.3M
  bf1[47] = bf0[61];
1868
32.3M
  bf1[48] = bf0[3];
1869
32.3M
  bf1[49] = bf0[35];
1870
32.3M
  bf1[50] = bf0[19];
1871
32.3M
  bf1[51] = bf0[51];
1872
32.3M
  bf1[52] = bf0[11];
1873
32.3M
  bf1[53] = bf0[43];
1874
32.3M
  bf1[54] = bf0[27];
1875
32.3M
  bf1[55] = bf0[59];
1876
32.3M
  bf1[56] = bf0[7];
1877
32.3M
  bf1[57] = bf0[39];
1878
32.3M
  bf1[58] = bf0[23];
1879
32.3M
  bf1[59] = bf0[55];
1880
32.3M
  bf1[60] = bf0[15];
1881
32.3M
  bf1[61] = bf0[47];
1882
32.3M
  bf1[62] = bf0[31];
1883
32.3M
  bf1[63] = bf0[63];
1884
32.3M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1885
32.3M
}