Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/av1_inv_txfm1d.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <stdlib.h>
13
#include "av1/common/av1_inv_txfm1d.h"
14
#include "av1/common/av1_txfm.h"
15
16
/*
 * av1_idct4: 4-point member of the inverse-DCT family (going by its name),
 * evaluated as a three-stage butterfly network.
 *
 *   input       - four coefficients, read only. Must not be the same pointer
 *                 as output (asserted).
 *   output      - receives the four results; it also serves as working
 *                 storage between stages.
 *   cos_bit     - selects the cosine table through cospi_arr() and is
 *                 forwarded unchanged to half_btf().
 *   stage_range - per-stage bit values indexed by stage number (1..3 are
 *                 read here); passed to av1_range_check_buf() and
 *                 clamp_value().
 */
void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit,
               const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 4;
  const int32_t *cospi = cospi_arr(cos_bit);
  int32_t tmp[4];
  int32_t stage = 0;

  // stage 1: gather the inputs into the order 0, 2, 1, 3.
  ++stage;
  output[0] = input[0];
  output[1] = input[2];
  output[2] = input[1];
  output[3] = input[3];
  av1_range_check_buf(stage, input, output, size, stage_range[stage]);

  // stage 2: output -> tmp, weighted pairs via half_btf().
  ++stage;
  tmp[0] = half_btf(cospi[32], output[0], cospi[32], output[1], cos_bit);
  tmp[1] = half_btf(cospi[32], output[0], -cospi[32], output[1], cos_bit);
  tmp[2] = half_btf(cospi[48], output[2], -cospi[16], output[3], cos_bit);
  tmp[3] = half_btf(cospi[16], output[2], cospi[48], output[3], cos_bit);
  av1_range_check_buf(stage, input, tmp, size, stage_range[stage]);

  // stage 3: tmp -> output. Element i is paired with its mirror 3 - i; the
  // sum lands in slot i and the difference in slot 3 - i, both clamped.
  ++stage;
  const int8_t bit = stage_range[stage];
  for (int i = 0; i < 2; ++i) {
    const int32_t lo = tmp[i];
    const int32_t hi = tmp[3 - i];
    output[i] = clamp_value(lo + hi, bit);
    output[3 - i] = clamp_value(lo - hi, bit);
  }
}
56
57
/*
 * av1_idct8: 8-point member of the inverse-DCT family (going by its name),
 * evaluated as a five-stage butterfly network that alternates between
 * `output` and a local scratch array.
 *
 *   input       - eight coefficients, read only. Must not be the same pointer
 *                 as output (asserted).
 *   output      - receives the eight results; also used as working storage.
 *   cos_bit     - selects the cosine table through cospi_arr() and is
 *                 forwarded unchanged to half_btf().
 *   stage_range - per-stage bit values indexed by stage number (1..5 are
 *                 read here); passed to av1_range_check_buf() and
 *                 clamp_value().
 */
void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit,
               const int8_t *stage_range) {
  // order[i] is the input index that feeds slot i in stage 1.
  static const int order[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };

  assert(output != input);
  const int32_t size = 8;
  const int32_t *cospi = cospi_arr(cos_bit);
  int32_t tmp[8];
  int32_t stage = 0;
  int8_t bit;
  int i;

  // stage 1: permute input -> output.
  ++stage;
  for (i = 0; i < 8; ++i) {
    output[i] = input[order[i]];
  }
  av1_range_check_buf(stage, input, output, size, stage_range[stage]);

  // stage 2: output -> tmp. Lower half passes through, upper half goes
  // through half_btf().
  ++stage;
  for (i = 0; i < 4; ++i) {
    tmp[i] = output[i];
  }
  tmp[4] = half_btf(cospi[56], output[4], -cospi[8], output[7], cos_bit);
  tmp[5] = half_btf(cospi[24], output[5], -cospi[40], output[6], cos_bit);
  tmp[6] = half_btf(cospi[40], output[5], cospi[24], output[6], cos_bit);
  tmp[7] = half_btf(cospi[8], output[4], cospi[56], output[7], cos_bit);
  av1_range_check_buf(stage, input, tmp, size, stage_range[stage]);

  // stage 3: tmp -> output.
  ++stage;
  bit = stage_range[stage];
  output[0] = half_btf(cospi[32], tmp[0], cospi[32], tmp[1], cos_bit);
  output[1] = half_btf(cospi[32], tmp[0], -cospi[32], tmp[1], cos_bit);
  output[2] = half_btf(cospi[48], tmp[2], -cospi[16], tmp[3], cos_bit);
  output[3] = half_btf(cospi[16], tmp[2], cospi[48], tmp[3], cos_bit);
  output[4] = clamp_value(tmp[4] + tmp[5], bit);
  output[5] = clamp_value(tmp[4] - tmp[5], bit);
  output[6] = clamp_value(-tmp[6] + tmp[7], bit);
  output[7] = clamp_value(tmp[6] + tmp[7], bit);
  av1_range_check_buf(stage, input, output, size, bit);

  // stage 4: output -> tmp.
  ++stage;
  bit = stage_range[stage];
  tmp[0] = clamp_value(output[0] + output[3], bit);
  tmp[1] = clamp_value(output[1] + output[2], bit);
  tmp[2] = clamp_value(output[1] - output[2], bit);
  tmp[3] = clamp_value(output[0] - output[3], bit);
  tmp[4] = output[4];
  tmp[5] = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
  tmp[6] = half_btf(cospi[32], output[5], cospi[32], output[6], cos_bit);
  tmp[7] = output[7];
  av1_range_check_buf(stage, input, tmp, size, bit);

  // stage 5: tmp -> output. Element i is paired with its mirror 7 - i; the
  // sum lands in slot i and the difference in slot 7 - i, both clamped.
  ++stage;
  bit = stage_range[stage];
  for (i = 0; i < 4; ++i) {
    const int32_t lo = tmp[i];
    const int32_t hi = tmp[7 - i];
    output[i] = clamp_value(lo + hi, bit);
    output[7 - i] = clamp_value(lo - hi, bit);
  }
}
137
138
/*
 * av1_idct16: 16-point member of the inverse-DCT family (going by its name),
 * evaluated as a seven-stage butterfly network that alternates between
 * `output` and a local scratch array.
 *
 *   input       - sixteen coefficients, read only. Must not be the same
 *                 pointer as output (asserted).
 *   output      - receives the sixteen results; also used as working storage.
 *   cos_bit     - selects the cosine table through cospi_arr() and is
 *                 forwarded unchanged to half_btf().
 *   stage_range - per-stage bit values indexed by stage number (1..7 are
 *                 read here); passed to av1_range_check_buf() and
 *                 clamp_value().
 */
void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  // order[i] is the input index that feeds slot i in stage 1.
  static const int order[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                 1, 9, 5, 13, 3, 11, 7, 15 };

  assert(output != input);
  const int32_t size = 16;
  const int32_t *cospi = cospi_arr(cos_bit);
  int32_t tmp[16];
  int32_t stage = 0;
  int8_t bit;
  int i;

  // stage 1: permute input -> output.
  ++stage;
  for (i = 0; i < 16; ++i) {
    output[i] = input[order[i]];
  }
  av1_range_check_buf(stage, input, output, size, stage_range[stage]);

  // stage 2: output -> tmp. Slots 0..7 pass through, 8..15 go through
  // half_btf().
  ++stage;
  for (i = 0; i < 8; ++i) {
    tmp[i] = output[i];
  }
  tmp[8] = half_btf(cospi[60], output[8], -cospi[4], output[15], cos_bit);
  tmp[9] = half_btf(cospi[28], output[9], -cospi[36], output[14], cos_bit);
  tmp[10] = half_btf(cospi[44], output[10], -cospi[20], output[13], cos_bit);
  tmp[11] = half_btf(cospi[12], output[11], -cospi[52], output[12], cos_bit);
  tmp[12] = half_btf(cospi[52], output[11], cospi[12], output[12], cos_bit);
  tmp[13] = half_btf(cospi[20], output[10], cospi[44], output[13], cos_bit);
  tmp[14] = half_btf(cospi[36], output[9], cospi[28], output[14], cos_bit);
  tmp[15] = half_btf(cospi[4], output[8], cospi[60], output[15], cos_bit);
  av1_range_check_buf(stage, input, tmp, size, stage_range[stage]);

  // stage 3: tmp -> output.
  ++stage;
  bit = stage_range[stage];
  for (i = 0; i < 4; ++i) {
    output[i] = tmp[i];
  }
  output[4] = half_btf(cospi[56], tmp[4], -cospi[8], tmp[7], cos_bit);
  output[5] = half_btf(cospi[24], tmp[5], -cospi[40], tmp[6], cos_bit);
  output[6] = half_btf(cospi[40], tmp[5], cospi[24], tmp[6], cos_bit);
  output[7] = half_btf(cospi[8], tmp[4], cospi[56], tmp[7], cos_bit);
  // Slots 8..15 in groups of four: (a+b, a-b, -c+d, c+d).
  for (i = 8; i < 16; i += 4) {
    output[i] = clamp_value(tmp[i] + tmp[i + 1], bit);
    output[i + 1] = clamp_value(tmp[i] - tmp[i + 1], bit);
    output[i + 2] = clamp_value(-tmp[i + 2] + tmp[i + 3], bit);
    output[i + 3] = clamp_value(tmp[i + 2] + tmp[i + 3], bit);
  }
  av1_range_check_buf(stage, input, output, size, bit);

  // stage 4: output -> tmp.
  ++stage;
  bit = stage_range[stage];
  tmp[0] = half_btf(cospi[32], output[0], cospi[32], output[1], cos_bit);
  tmp[1] = half_btf(cospi[32], output[0], -cospi[32], output[1], cos_bit);
  tmp[2] = half_btf(cospi[48], output[2], -cospi[16], output[3], cos_bit);
  tmp[3] = half_btf(cospi[16], output[2], cospi[48], output[3], cos_bit);
  tmp[4] = clamp_value(output[4] + output[5], bit);
  tmp[5] = clamp_value(output[4] - output[5], bit);
  tmp[6] = clamp_value(-output[6] + output[7], bit);
  tmp[7] = clamp_value(output[6] + output[7], bit);
  tmp[8] = output[8];
  tmp[9] = half_btf(-cospi[16], output[9], cospi[48], output[14], cos_bit);
  tmp[10] = half_btf(-cospi[48], output[10], -cospi[16], output[13], cos_bit);
  tmp[11] = output[11];
  tmp[12] = output[12];
  tmp[13] = half_btf(-cospi[16], output[10], cospi[48], output[13], cos_bit);
  tmp[14] = half_btf(cospi[48], output[9], cospi[16], output[14], cos_bit);
  tmp[15] = output[15];
  av1_range_check_buf(stage, input, tmp, size, bit);

  // stage 5: tmp -> output.
  ++stage;
  bit = stage_range[stage];
  output[0] = clamp_value(tmp[0] + tmp[3], bit);
  output[1] = clamp_value(tmp[1] + tmp[2], bit);
  output[2] = clamp_value(tmp[1] - tmp[2], bit);
  output[3] = clamp_value(tmp[0] - tmp[3], bit);
  output[4] = tmp[4];
  output[5] = half_btf(-cospi[32], tmp[5], cospi[32], tmp[6], cos_bit);
  output[6] = half_btf(cospi[32], tmp[5], cospi[32], tmp[6], cos_bit);
  output[7] = tmp[7];
  output[8] = clamp_value(tmp[8] + tmp[11], bit);
  output[9] = clamp_value(tmp[9] + tmp[10], bit);
  output[10] = clamp_value(tmp[9] - tmp[10], bit);
  output[11] = clamp_value(tmp[8] - tmp[11], bit);
  output[12] = clamp_value(-tmp[12] + tmp[15], bit);
  output[13] = clamp_value(-tmp[13] + tmp[14], bit);
  output[14] = clamp_value(tmp[13] + tmp[14], bit);
  output[15] = clamp_value(tmp[12] + tmp[15], bit);
  av1_range_check_buf(stage, input, output, size, bit);

  // stage 6: output -> tmp. Slots 0..7 are a mirrored add/subtract
  // (i paired with 7 - i); 8..15 are pass-through or half_btf().
  ++stage;
  bit = stage_range[stage];
  for (i = 0; i < 4; ++i) {
    const int32_t lo = output[i];
    const int32_t hi = output[7 - i];
    tmp[i] = clamp_value(lo + hi, bit);
    tmp[7 - i] = clamp_value(lo - hi, bit);
  }
  tmp[8] = output[8];
  tmp[9] = output[9];
  tmp[10] = half_btf(-cospi[32], output[10], cospi[32], output[13], cos_bit);
  tmp[11] = half_btf(-cospi[32], output[11], cospi[32], output[12], cos_bit);
  tmp[12] = half_btf(cospi[32], output[11], cospi[32], output[12], cos_bit);
  tmp[13] = half_btf(cospi[32], output[10], cospi[32], output[13], cos_bit);
  tmp[14] = output[14];
  tmp[15] = output[15];
  av1_range_check_buf(stage, input, tmp, size, bit);

  // stage 7: tmp -> output. Element i is paired with its mirror 15 - i; the
  // sum lands in slot i and the difference in slot 15 - i, both clamped.
  ++stage;
  bit = stage_range[stage];
  for (i = 0; i < 8; ++i) {
    const int32_t lo = tmp[i];
    const int32_t hi = tmp[15 - i];
    output[i] = clamp_value(lo + hi, bit);
    output[15 - i] = clamp_value(lo - hi, bit);
  }
}
302
303
/*
 * av1_idct32: 32-point member of the inverse-DCT family (going by its name),
 * evaluated as a nine-stage butterfly network that alternates between
 * `output` and a local scratch array.
 *
 *   input       - thirty-two coefficients, read only. Must not be the same
 *                 pointer as output (asserted).
 *   output      - receives the thirty-two results; also used as working
 *                 storage.
 *   cos_bit     - selects the cosine table through cospi_arr() and is
 *                 forwarded unchanged to half_btf().
 *   stage_range - per-stage bit values indexed by stage number (1..9 are
 *                 read here); passed to av1_range_check_buf() and
 *                 clamp_value().
 */
void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  // order[i] is the input index that feeds slot i in stage 1.
  static const int order[32] = { 0, 16, 8,  24, 4, 20, 12, 28,
                                 2, 18, 10, 26, 6, 22, 14, 30,
                                 1, 17, 9,  25, 5, 21, 13, 29,
                                 3, 19, 11, 27, 7, 23, 15, 31 };

  assert(output != input);
  const int32_t size = 32;
  const int32_t *cospi = cospi_arr(cos_bit);
  int32_t tmp[32];
  int32_t stage = 0;
  int8_t bit;
  int i;

  // stage 1: permute input -> output.
  ++stage;
  for (i = 0; i < 32; ++i) {
    output[i] = input[order[i]];
  }
  av1_range_check_buf(stage, input, output, size, stage_range[stage]);

  // stage 2: output -> tmp. Slots 0..15 pass through, 16..31 go through
  // half_btf().
  ++stage;
  for (i = 0; i < 16; ++i) {
    tmp[i] = output[i];
  }
  tmp[16] = half_btf(cospi[62], output[16], -cospi[2], output[31], cos_bit);
  tmp[17] = half_btf(cospi[30], output[17], -cospi[34], output[30], cos_bit);
  tmp[18] = half_btf(cospi[46], output[18], -cospi[18], output[29], cos_bit);
  tmp[19] = half_btf(cospi[14], output[19], -cospi[50], output[28], cos_bit);
  tmp[20] = half_btf(cospi[54], output[20], -cospi[10], output[27], cos_bit);
  tmp[21] = half_btf(cospi[22], output[21], -cospi[42], output[26], cos_bit);
  tmp[22] = half_btf(cospi[38], output[22], -cospi[26], output[25], cos_bit);
  tmp[23] = half_btf(cospi[6], output[23], -cospi[58], output[24], cos_bit);
  tmp[24] = half_btf(cospi[58], output[23], cospi[6], output[24], cos_bit);
  tmp[25] = half_btf(cospi[26], output[22], cospi[38], output[25], cos_bit);
  tmp[26] = half_btf(cospi[42], output[21], cospi[22], output[26], cos_bit);
  tmp[27] = half_btf(cospi[10], output[20], cospi[54], output[27], cos_bit);
  tmp[28] = half_btf(cospi[50], output[19], cospi[14], output[28], cos_bit);
  tmp[29] = half_btf(cospi[18], output[18], cospi[46], output[29], cos_bit);
  tmp[30] = half_btf(cospi[34], output[17], cospi[30], output[30], cos_bit);
  tmp[31] = half_btf(cospi[2], output[16], cospi[62], output[31], cos_bit);
  av1_range_check_buf(stage, input, tmp, size, stage_range[stage]);

  // stage 3: tmp -> output.
  ++stage;
  bit = stage_range[stage];
  for (i = 0; i < 8; ++i) {
    output[i] = tmp[i];
  }
  output[8] = half_btf(cospi[60], tmp[8], -cospi[4], tmp[15], cos_bit);
  output[9] = half_btf(cospi[28], tmp[9], -cospi[36], tmp[14], cos_bit);
  output[10] = half_btf(cospi[44], tmp[10], -cospi[20], tmp[13], cos_bit);
  output[11] = half_btf(cospi[12], tmp[11], -cospi[52], tmp[12], cos_bit);
  output[12] = half_btf(cospi[52], tmp[11], cospi[12], tmp[12], cos_bit);
  output[13] = half_btf(cospi[20], tmp[10], cospi[44], tmp[13], cos_bit);
  output[14] = half_btf(cospi[36], tmp[9], cospi[28], tmp[14], cos_bit);
  output[15] = half_btf(cospi[4], tmp[8], cospi[60], tmp[15], cos_bit);
  // Slots 16..31 in groups of four: (a+b, a-b, -c+d, c+d).
  for (i = 16; i < 32; i += 4) {
    output[i] = clamp_value(tmp[i] + tmp[i + 1], bit);
    output[i + 1] = clamp_value(tmp[i] - tmp[i + 1], bit);
    output[i + 2] = clamp_value(-tmp[i + 2] + tmp[i + 3], bit);
    output[i + 3] = clamp_value(tmp[i + 2] + tmp[i + 3], bit);
  }
  av1_range_check_buf(stage, input, output, size, bit);

  // stage 4: output -> tmp.
  ++stage;
  bit = stage_range[stage];
  for (i = 0; i < 4; ++i) {
    tmp[i] = output[i];
  }
  tmp[4] = half_btf(cospi[56], output[4], -cospi[8], output[7], cos_bit);
  tmp[5] = half_btf(cospi[24], output[5], -cospi[40], output[6], cos_bit);
  tmp[6] = half_btf(cospi[40], output[5], cospi[24], output[6], cos_bit);
  tmp[7] = half_btf(cospi[8], output[4], cospi[56], output[7], cos_bit);
  // Slots 8..15 in groups of four: (a+b, a-b, -c+d, c+d).
  for (i = 8; i < 16; i += 4) {
    tmp[i] = clamp_value(output[i] + output[i + 1], bit);
    tmp[i + 1] = clamp_value(output[i] - output[i + 1], bit);
    tmp[i + 2] = clamp_value(-output[i + 2] + output[i + 3], bit);
    tmp[i + 3] = clamp_value(output[i + 2] + output[i + 3], bit);
  }
  tmp[16] = output[16];
  tmp[17] = half_btf(-cospi[8], output[17], cospi[56], output[30], cos_bit);
  tmp[18] = half_btf(-cospi[56], output[18], -cospi[8], output[29], cos_bit);
  tmp[19] = output[19];
  tmp[20] = output[20];
  tmp[21] = half_btf(-cospi[40], output[21], cospi[24], output[26], cos_bit);
  tmp[22] = half_btf(-cospi[24], output[22], -cospi[40], output[25], cos_bit);
  tmp[23] = output[23];
  tmp[24] = output[24];
  tmp[25] = half_btf(-cospi[40], output[22], cospi[24], output[25], cos_bit);
  tmp[26] = half_btf(cospi[24], output[21], cospi[40], output[26], cos_bit);
  tmp[27] = output[27];
  tmp[28] = output[28];
  tmp[29] = half_btf(-cospi[8], output[18], cospi[56], output[29], cos_bit);
  tmp[30] = half_btf(cospi[56], output[17], cospi[8], output[30], cos_bit);
  tmp[31] = output[31];
  av1_range_check_buf(stage, input, tmp, size, bit);

  // stage 5: tmp -> output.
  ++stage;
  bit = stage_range[stage];
  output[0] = half_btf(cospi[32], tmp[0], cospi[32], tmp[1], cos_bit);
  output[1] = half_btf(cospi[32], tmp[0], -cospi[32], tmp[1], cos_bit);
  output[2] = half_btf(cospi[48], tmp[2], -cospi[16], tmp[3], cos_bit);
  output[3] = half_btf(cospi[16], tmp[2], cospi[48], tmp[3], cos_bit);
  output[4] = clamp_value(tmp[4] + tmp[5], bit);
  output[5] = clamp_value(tmp[4] - tmp[5], bit);
  output[6] = clamp_value(-tmp[6] + tmp[7], bit);
  output[7] = clamp_value(tmp[6] + tmp[7], bit);
  output[8] = tmp[8];
  output[9] = half_btf(-cospi[16], tmp[9], cospi[48], tmp[14], cos_bit);
  output[10] = half_btf(-cospi[48], tmp[10], -cospi[16], tmp[13], cos_bit);
  output[11] = tmp[11];
  output[12] = tmp[12];
  output[13] = half_btf(-cospi[16], tmp[10], cospi[48], tmp[13], cos_bit);
  output[14] = half_btf(cospi[48], tmp[9], cospi[16], tmp[14], cos_bit);
  output[15] = tmp[15];
  // Slots 16..31 in groups of eight: the first four are a mirrored
  // add/subtract, the last four a negated-first-operand variant.
  for (i = 16; i < 32; i += 8) {
    output[i] = clamp_value(tmp[i] + tmp[i + 3], bit);
    output[i + 1] = clamp_value(tmp[i + 1] + tmp[i + 2], bit);
    output[i + 2] = clamp_value(tmp[i + 1] - tmp[i + 2], bit);
    output[i + 3] = clamp_value(tmp[i] - tmp[i + 3], bit);
    output[i + 4] = clamp_value(-tmp[i + 4] + tmp[i + 7], bit);
    output[i + 5] = clamp_value(-tmp[i + 5] + tmp[i + 6], bit);
    output[i + 6] = clamp_value(tmp[i + 5] + tmp[i + 6], bit);
    output[i + 7] = clamp_value(tmp[i + 4] + tmp[i + 7], bit);
  }
  av1_range_check_buf(stage, input, output, size, bit);

  // stage 6: output -> tmp.
  ++stage;
  bit = stage_range[stage];
  tmp[0] = clamp_value(output[0] + output[3], bit);
  tmp[1] = clamp_value(output[1] + output[2], bit);
  tmp[2] = clamp_value(output[1] - output[2], bit);
  tmp[3] = clamp_value(output[0] - output[3], bit);
  tmp[4] = output[4];
  tmp[5] = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
  tmp[6] = half_btf(cospi[32], output[5], cospi[32], output[6], cos_bit);
  tmp[7] = output[7];
  tmp[8] = clamp_value(output[8] + output[11], bit);
  tmp[9] = clamp_value(output[9] + output[10], bit);
  tmp[10] = clamp_value(output[9] - output[10], bit);
  tmp[11] = clamp_value(output[8] - output[11], bit);
  tmp[12] = clamp_value(-output[12] + output[15], bit);
  tmp[13] = clamp_value(-output[13] + output[14], bit);
  tmp[14] = clamp_value(output[13] + output[14], bit);
  tmp[15] = clamp_value(output[12] + output[15], bit);
  tmp[16] = output[16];
  tmp[17] = output[17];
  tmp[18] = half_btf(-cospi[16], output[18], cospi[48], output[29], cos_bit);
  tmp[19] = half_btf(-cospi[16], output[19], cospi[48], output[28], cos_bit);
  tmp[20] = half_btf(-cospi[48], output[20], -cospi[16], output[27], cos_bit);
  tmp[21] = half_btf(-cospi[48], output[21], -cospi[16], output[26], cos_bit);
  tmp[22] = output[22];
  tmp[23] = output[23];
  tmp[24] = output[24];
  tmp[25] = output[25];
  tmp[26] = half_btf(-cospi[16], output[21], cospi[48], output[26], cos_bit);
  tmp[27] = half_btf(-cospi[16], output[20], cospi[48], output[27], cos_bit);
  tmp[28] = half_btf(cospi[48], output[19], cospi[16], output[28], cos_bit);
  tmp[29] = half_btf(cospi[48], output[18], cospi[16], output[29], cos_bit);
  tmp[30] = output[30];
  tmp[31] = output[31];
  av1_range_check_buf(stage, input, tmp, size, bit);

  // stage 7: tmp -> output.
  ++stage;
  bit = stage_range[stage];
  // Slots 0..7: mirrored add/subtract, i paired with 7 - i.
  for (i = 0; i < 4; ++i) {
    const int32_t lo = tmp[i];
    const int32_t hi = tmp[7 - i];
    output[i] = clamp_value(lo + hi, bit);
    output[7 - i] = clamp_value(lo - hi, bit);
  }
  output[8] = tmp[8];
  output[9] = tmp[9];
  output[10] = half_btf(-cospi[32], tmp[10], cospi[32], tmp[13], cos_bit);
  output[11] = half_btf(-cospi[32], tmp[11], cospi[32], tmp[12], cos_bit);
  output[12] = half_btf(cospi[32], tmp[11], cospi[32], tmp[12], cos_bit);
  output[13] = half_btf(cospi[32], tmp[10], cospi[32], tmp[13], cos_bit);
  output[14] = tmp[14];
  output[15] = tmp[15];
  // Slots 16..23 mirror around 19/20 (sum, difference); slots 24..31 mirror
  // around 27/28 with the first operand negated in the lower half.
  for (i = 0; i < 4; ++i) {
    output[16 + i] = clamp_value(tmp[16 + i] + tmp[23 - i], bit);
    output[23 - i] = clamp_value(tmp[16 + i] - tmp[23 - i], bit);
    output[24 + i] = clamp_value(-tmp[24 + i] + tmp[31 - i], bit);
    output[31 - i] = clamp_value(tmp[24 + i] + tmp[31 - i], bit);
  }
  av1_range_check_buf(stage, input, output, size, bit);

  // stage 8: output -> tmp.
  ++stage;
  bit = stage_range[stage];
  // Slots 0..15: mirrored add/subtract, i paired with 15 - i.
  for (i = 0; i < 8; ++i) {
    const int32_t lo = output[i];
    const int32_t hi = output[15 - i];
    tmp[i] = clamp_value(lo + hi, bit);
    tmp[15 - i] = clamp_value(lo - hi, bit);
  }
  for (i = 16; i < 20; ++i) {
    tmp[i] = output[i];
  }
  tmp[20] = half_btf(-cospi[32], output[20], cospi[32], output[27], cos_bit);
  tmp[21] = half_btf(-cospi[32], output[21], cospi[32], output[26], cos_bit);
  tmp[22] = half_btf(-cospi[32], output[22], cospi[32], output[25], cos_bit);
  tmp[23] = half_btf(-cospi[32], output[23], cospi[32], output[24], cos_bit);
  tmp[24] = half_btf(cospi[32], output[23], cospi[32], output[24], cos_bit);
  tmp[25] = half_btf(cospi[32], output[22], cospi[32], output[25], cos_bit);
  tmp[26] = half_btf(cospi[32], output[21], cospi[32], output[26], cos_bit);
  tmp[27] = half_btf(cospi[32], output[20], cospi[32], output[27], cos_bit);
  for (i = 28; i < 32; ++i) {
    tmp[i] = output[i];
  }
  av1_range_check_buf(stage, input, tmp, size, bit);

  // stage 9: tmp -> output. Element i is paired with its mirror 31 - i; the
  // sum lands in slot i and the difference in slot 31 - i, both clamped.
  ++stage;
  bit = stage_range[stage];
  for (i = 0; i < 16; ++i) {
    const int32_t lo = tmp[i];
    const int32_t hi = tmp[31 - i];
    output[i] = clamp_value(lo + hi, bit);
    output[31 - i] = clamp_value(lo - hi, bit);
  }
}
655
656
// 4-point transform kernel (presumably the inverse ADST, going by the name).
//
// input:       4 coefficients; all four are read before output is written.
// output:      4 results.
// cos_bit:     selects the sinpi table and is the final rounding shift.
// stage_range: per-stage bit widths handed to range_check_value64(); entries
//              1..6 are read.
void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  const int bit = cos_bit;
  const int32_t *sinpi = sinpi_arr(bit);

  // Widen up front so every product below is formed in 64 bits.
  const int64_t in0 = input[0];
  const int64_t in1 = input[1];
  const int64_t in2 = input[2];
  const int64_t in3 = input[3];

  // An all-zero input yields an all-zero output; skip the arithmetic.
  if (in0 == 0 && in1 == 0 && in2 == 0 && in3 == 0) {
    for (int i = 0; i < 4; ++i) output[i] = 0;
    return;
  }

  assert(sinpi[1] + sinpi[2] == sinpi[4]);

  // stage 1: weight the inputs by the sinpi constants.
  const int width1 = stage_range[1] + bit;
  const int64_t a0 = range_check_value64(sinpi[1] * in0, width1);
  const int64_t a1 = range_check_value64(sinpi[2] * in0, width1);
  const int64_t a2 = range_check_value64(sinpi[3] * in1, width1);
  const int64_t a3 = range_check_value64(sinpi[4] * in2, width1);
  const int64_t a4 = range_check_value64(sinpi[1] * in2, width1);
  const int64_t a5 = range_check_value64(sinpi[2] * in3, width1);
  const int64_t a6 = range_check_value64(sinpi[4] * in3, width1);

  // stage 2
  // NOTICE: (in0 - in2) here may use one extra bit compared to the
  // opt_range_row/col specified in av1_gen_inv_stage_range()
  const int64_t a7 = range_check_value64((in0 - in2) + in3, stage_range[2]);

  // stage 3
  const int width3 = stage_range[3] + bit;
  const int64_t b0 = range_check_value64(a0 + a3, width3);
  const int64_t b1 = range_check_value64(a1 - a4, width3);
  const int64_t b3 = range_check_value64(a2, width3);
  const int64_t b2 = range_check_value64(sinpi[3] * a7, width3);

  // stage 4
  const int width4 = stage_range[4] + bit;
  const int64_t c0 = range_check_value64(b0 + a5, width4);
  const int64_t c1 = range_check_value64(b1 - a6, width4);

  // stage 5
  const int width5 = stage_range[5] + bit;
  const int64_t y0 = range_check_value64(c0 + b3, width5);
  const int64_t y1 = range_check_value64(c1 + b3, width5);
  const int64_t y2 = range_check_value64(b2, width5);
  const int64_t y3_partial = range_check_value64(c0 + c1, width5);

  // stage 6
  const int64_t y3 = range_check_value64(y3_partial - b3, stage_range[6] + bit);

  // Remove the fixed-point scaling introduced by the sinpi multiplications.
  output[0] = round_shift(y0, bit);
  output[1] = round_shift(y1, bit);
  output[2] = round_shift(y2, bit);
  output[3] = round_shift(y3, bit);
}
712
713
// 8-point transform kernel (presumably the inverse ADST, going by the name).
//
// Data ping-pongs between `output` and a local scratch array: odd stages
// write `output`, even stages write `step`. After each of stages 1..6 the
// freshly written buffer is passed to av1_range_check_buf().
//
// input:       8 coefficients; must not be the same pointer as output.
// output:      8 results (also used as working storage).
// cos_bit:     selects the cospi table and is forwarded to half_btf().
// stage_range: per-stage bit widths; entries 1..6 are read.
void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  assert(output != input);

  // Source index for each slot in the stage-1 gather and the final scatter.
  static const int kInputOrder[8] = { 7, 0, 5, 2, 3, 4, 1, 6 };
  static const int kOutputOrder[8] = { 0, 4, 6, 2, 3, 7, 5, 1 };

  const int32_t num_points = 8;
  const int32_t *cospi = cospi_arr(cos_bit);
  int32_t step[8];
  int32_t stage = 0;

  // stage 0;

  // stage 1: reorder the input.
  ++stage;
  for (int i = 0; i < 8; ++i) output[i] = input[kInputOrder[i]];
  av1_range_check_buf(stage, input, output, num_points, stage_range[stage]);

  // stage 2: one rotation per adjacent pair, weights cospi[4 + 16k] and
  // cospi[60 - 16k].
  ++stage;
  for (int pair = 0; pair < 4; ++pair) {
    const int even = 2 * pair;
    const int odd = even + 1;
    const int32_t w_a = cospi[16 * pair + 4];
    const int32_t w_b = cospi[60 - 16 * pair];
    step[even] = half_btf(w_a, output[even], w_b, output[odd], cos_bit);
    step[odd] = half_btf(w_b, output[even], -w_a, output[odd], cos_bit);
  }
  av1_range_check_buf(stage, input, step, num_points, stage_range[stage]);

  // stage 3: sum/difference butterflies at distance 4.
  ++stage;
  for (int i = 0; i < 4; ++i) {
    output[i] = clamp_value(step[i] + step[i + 4], stage_range[stage]);
    output[i + 4] = clamp_value(step[i] - step[i + 4], stage_range[stage]);
  }
  av1_range_check_buf(stage, input, output, num_points, stage_range[stage]);

  // stage 4: lower half passes through, upper half is rotated.
  ++stage;
  for (int i = 0; i < 4; ++i) step[i] = output[i];
  step[4] = half_btf(cospi[16], output[4], cospi[48], output[5], cos_bit);
  step[5] = half_btf(cospi[48], output[4], -cospi[16], output[5], cos_bit);
  step[6] = half_btf(-cospi[48], output[6], cospi[16], output[7], cos_bit);
  step[7] = half_btf(cospi[16], output[6], cospi[48], output[7], cos_bit);
  av1_range_check_buf(stage, input, step, num_points, stage_range[stage]);

  // stage 5: sum/difference butterflies at distance 2 within each group of 4.
  ++stage;
  for (int base = 0; base < 8; base += 4) {
    for (int i = base; i < base + 2; ++i) {
      output[i] = clamp_value(step[i] + step[i + 2], stage_range[stage]);
      output[i + 2] = clamp_value(step[i] - step[i + 2], stage_range[stage]);
    }
  }
  av1_range_check_buf(stage, input, output, num_points, stage_range[stage]);

  // stage 6: in each group of 4, the first two pass through and the last two
  // are combined with cospi[32].
  ++stage;
  for (int base = 0; base < 8; base += 4) {
    step[base] = output[base];
    step[base + 1] = output[base + 1];
    step[base + 2] = half_btf(cospi[32], output[base + 2], cospi[32],
                              output[base + 3], cos_bit);
    step[base + 3] = half_btf(cospi[32], output[base + 2], -cospi[32],
                              output[base + 3], cos_bit);
  }
  av1_range_check_buf(stage, input, step, num_points, stage_range[stage]);

  // stage 7: final reorder; odd output slots are negated.
  for (int i = 0; i < 8; ++i) {
    const int32_t v = step[kOutputOrder[i]];
    output[i] = (i & 1) ? -v : v;
  }
}
820
821
// 16-point transform kernel (presumably the inverse ADST, going by the name).
//
// Data ping-pongs between `output` and a local scratch array: odd stages
// write `output`, even stages write `step`. After each of stages 1..8 the
// freshly written buffer is passed to av1_range_check_buf().
//
// input:       16 coefficients; must not be the same pointer as output.
// output:      16 results (also used as working storage).
// cos_bit:     selects the cospi table and is forwarded to half_btf().
// stage_range: per-stage bit widths; entries 1..8 are read.
void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                 const int8_t *stage_range) {
  assert(output != input);

  // Source index for each slot in the stage-1 gather and the final scatter.
  static const int kInputOrder[16] = { 15, 0, 13, 2,  11, 4,  9, 6,
                                       7,  8, 5,  10, 3,  12, 1, 14 };
  static const int kOutputOrder[16] = { 0, 8,  12, 4, 6, 14, 10, 2,
                                        3, 11, 15, 7, 5, 13, 9,  1 };

  const int32_t num_points = 16;
  const int32_t *cospi = cospi_arr(cos_bit);
  int32_t step[16];
  int32_t stage = 0;

  // stage 0;

  // stage 1: reorder the input.
  ++stage;
  for (int i = 0; i < 16; ++i) output[i] = input[kInputOrder[i]];
  av1_range_check_buf(stage, input, output, num_points, stage_range[stage]);

  // stage 2: one rotation per adjacent pair, weights cospi[2 + 8k] and
  // cospi[62 - 8k].
  ++stage;
  for (int pair = 0; pair < 8; ++pair) {
    const int even = 2 * pair;
    const int odd = even + 1;
    const int32_t w_a = cospi[8 * pair + 2];
    const int32_t w_b = cospi[62 - 8 * pair];
    step[even] = half_btf(w_a, output[even], w_b, output[odd], cos_bit);
    step[odd] = half_btf(w_b, output[even], -w_a, output[odd], cos_bit);
  }
  av1_range_check_buf(stage, input, step, num_points, stage_range[stage]);

  // stage 3: sum/difference butterflies at distance 8.
  ++stage;
  for (int i = 0; i < 8; ++i) {
    output[i] = clamp_value(step[i] + step[i + 8], stage_range[stage]);
    output[i + 8] = clamp_value(step[i] - step[i + 8], stage_range[stage]);
  }
  av1_range_check_buf(stage, input, output, num_points, stage_range[stage]);

  // stage 4: lower half passes through, upper half is rotated.
  ++stage;
  for (int i = 0; i < 8; ++i) step[i] = output[i];
  step[8] = half_btf(cospi[8], output[8], cospi[56], output[9], cos_bit);
  step[9] = half_btf(cospi[56], output[8], -cospi[8], output[9], cos_bit);
  step[10] = half_btf(cospi[40], output[10], cospi[24], output[11], cos_bit);
  step[11] = half_btf(cospi[24], output[10], -cospi[40], output[11], cos_bit);
  step[12] = half_btf(-cospi[56], output[12], cospi[8], output[13], cos_bit);
  step[13] = half_btf(cospi[8], output[12], cospi[56], output[13], cos_bit);
  step[14] = half_btf(-cospi[24], output[14], cospi[40], output[15], cos_bit);
  step[15] = half_btf(cospi[40], output[14], cospi[24], output[15], cos_bit);
  av1_range_check_buf(stage, input, step, num_points, stage_range[stage]);

  // stage 5: sum/difference butterflies at distance 4 within each group of 8.
  ++stage;
  for (int base = 0; base < 16; base += 8) {
    for (int i = base; i < base + 4; ++i) {
      output[i] = clamp_value(step[i] + step[i + 4], stage_range[stage]);
      output[i + 4] = clamp_value(step[i] - step[i + 4], stage_range[stage]);
    }
  }
  av1_range_check_buf(stage, input, output, num_points, stage_range[stage]);

  // stage 6: in each group of 8, the first four pass through and the last
  // four are rotated with cospi[16]/cospi[48].
  ++stage;
  for (int base = 0; base < 16; base += 8) {
    for (int i = base; i < base + 4; ++i) step[i] = output[i];
    step[base + 4] = half_btf(cospi[16], output[base + 4], cospi[48],
                              output[base + 5], cos_bit);
    step[base + 5] = half_btf(cospi[48], output[base + 4], -cospi[16],
                              output[base + 5], cos_bit);
    step[base + 6] = half_btf(-cospi[48], output[base + 6], cospi[16],
                              output[base + 7], cos_bit);
    step[base + 7] = half_btf(cospi[16], output[base + 6], cospi[48],
                              output[base + 7], cos_bit);
  }
  av1_range_check_buf(stage, input, step, num_points, stage_range[stage]);

  // stage 7: sum/difference butterflies at distance 2 within each group of 4.
  ++stage;
  for (int base = 0; base < 16; base += 4) {
    for (int i = base; i < base + 2; ++i) {
      output[i] = clamp_value(step[i] + step[i + 2], stage_range[stage]);
      output[i + 2] = clamp_value(step[i] - step[i + 2], stage_range[stage]);
    }
  }
  av1_range_check_buf(stage, input, output, num_points, stage_range[stage]);

  // stage 8: in each group of 4, the first two pass through and the last two
  // are combined with cospi[32].
  ++stage;
  for (int base = 0; base < 16; base += 4) {
    step[base] = output[base];
    step[base + 1] = output[base + 1];
    step[base + 2] = half_btf(cospi[32], output[base + 2], cospi[32],
                              output[base + 3], cos_bit);
    step[base + 3] = half_btf(cospi[32], output[base + 2], -cospi[32],
                              output[base + 3], cos_bit);
  }
  av1_range_check_buf(stage, input, step, num_points, stage_range[stage]);

  // stage 9: final reorder; odd output slots are negated.
  for (int i = 0; i < 16; ++i) {
    const int32_t v = step[kOutputOrder[i]];
    output[i] = (i & 1) ? -v : v;
  }
}
1028
1029
void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1030
2.53M
                      const int8_t *stage_range) {
1031
2.53M
  (void)cos_bit;
1032
2.53M
  (void)stage_range;
1033
12.6M
  for (int i = 0; i < 4; ++i) {
1034
10.1M
    output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits);
1035
10.1M
  }
1036
2.53M
  assert(stage_range[0] + NewSqrt2Bits <= 32);
1037
2.53M
}
1038
1039
// 8-point identity kernel: every output element is twice the matching input.
// The doubling is done in 64 bits and then narrowed back to int32_t.
//
// cos_bit and stage_range are accepted for signature parity and are unused.
void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;

  int idx = 0;
  while (idx < 8) {
    const int64_t doubled = (int64_t)input[idx] * 2;
    output[idx] = (int32_t)doubled;
    ++idx;
  }
}
1045
1046
void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1047
1.75M
                       const int8_t *stage_range) {
1048
1.75M
  (void)cos_bit;
1049
1.75M
  (void)stage_range;
1050
29.7M
  for (int i = 0; i < 16; ++i)
1051
28.0M
    output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits);
1052
1.75M
  assert(stage_range[0] + NewSqrt2Bits <= 32);
1053
1.75M
}
1054
1055
// 32-point identity kernel: every output element is four times the matching
// input. The multiply is done in 64 bits and then narrowed back to int32_t.
//
// cos_bit and stage_range are accepted for signature parity and are unused.
void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;

  int idx = 0;
  while (idx < 32) {
    const int64_t quadrupled = (int64_t)input[idx] * 4;
    output[idx] = (int32_t)quadrupled;
    ++idx;
  }
}
1061
1062
void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit,
1063
23.0M
                const int8_t *stage_range) {
1064
23.0M
  assert(output != input);
1065
23.0M
  const int32_t size = 64;
1066
23.0M
  const int32_t *cospi = cospi_arr(cos_bit);
1067
1068
23.0M
  int32_t stage = 0;
1069
23.0M
  int32_t *bf0, *bf1;
1070
23.0M
  int32_t step[64];
1071
1072
  // stage 0;
1073
1074
  // stage 1;
1075
23.0M
  stage++;
1076
23.0M
  bf1 = output;
1077
23.0M
  bf1[0] = input[0];
1078
23.0M
  bf1[1] = input[32];
1079
23.0M
  bf1[2] = input[16];
1080
23.0M
  bf1[3] = input[48];
1081
23.0M
  bf1[4] = input[8];
1082
23.0M
  bf1[5] = input[40];
1083
23.0M
  bf1[6] = input[24];
1084
23.0M
  bf1[7] = input[56];
1085
23.0M
  bf1[8] = input[4];
1086
23.0M
  bf1[9] = input[36];
1087
23.0M
  bf1[10] = input[20];
1088
23.0M
  bf1[11] = input[52];
1089
23.0M
  bf1[12] = input[12];
1090
23.0M
  bf1[13] = input[44];
1091
23.0M
  bf1[14] = input[28];
1092
23.0M
  bf1[15] = input[60];
1093
23.0M
  bf1[16] = input[2];
1094
23.0M
  bf1[17] = input[34];
1095
23.0M
  bf1[18] = input[18];
1096
23.0M
  bf1[19] = input[50];
1097
23.0M
  bf1[20] = input[10];
1098
23.0M
  bf1[21] = input[42];
1099
23.0M
  bf1[22] = input[26];
1100
23.0M
  bf1[23] = input[58];
1101
23.0M
  bf1[24] = input[6];
1102
23.0M
  bf1[25] = input[38];
1103
23.0M
  bf1[26] = input[22];
1104
23.0M
  bf1[27] = input[54];
1105
23.0M
  bf1[28] = input[14];
1106
23.0M
  bf1[29] = input[46];
1107
23.0M
  bf1[30] = input[30];
1108
23.0M
  bf1[31] = input[62];
1109
23.0M
  bf1[32] = input[1];
1110
23.0M
  bf1[33] = input[33];
1111
23.0M
  bf1[34] = input[17];
1112
23.0M
  bf1[35] = input[49];
1113
23.0M
  bf1[36] = input[9];
1114
23.0M
  bf1[37] = input[41];
1115
23.0M
  bf1[38] = input[25];
1116
23.0M
  bf1[39] = input[57];
1117
23.0M
  bf1[40] = input[5];
1118
23.0M
  bf1[41] = input[37];
1119
23.0M
  bf1[42] = input[21];
1120
23.0M
  bf1[43] = input[53];
1121
23.0M
  bf1[44] = input[13];
1122
23.0M
  bf1[45] = input[45];
1123
23.0M
  bf1[46] = input[29];
1124
23.0M
  bf1[47] = input[61];
1125
23.0M
  bf1[48] = input[3];
1126
23.0M
  bf1[49] = input[35];
1127
23.0M
  bf1[50] = input[19];
1128
23.0M
  bf1[51] = input[51];
1129
23.0M
  bf1[52] = input[11];
1130
23.0M
  bf1[53] = input[43];
1131
23.0M
  bf1[54] = input[27];
1132
23.0M
  bf1[55] = input[59];
1133
23.0M
  bf1[56] = input[7];
1134
23.0M
  bf1[57] = input[39];
1135
23.0M
  bf1[58] = input[23];
1136
23.0M
  bf1[59] = input[55];
1137
23.0M
  bf1[60] = input[15];
1138
23.0M
  bf1[61] = input[47];
1139
23.0M
  bf1[62] = input[31];
1140
23.0M
  bf1[63] = input[63];
1141
23.0M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1142
1143
  // stage 2
1144
23.0M
  stage++;
1145
23.0M
  bf0 = output;
1146
23.0M
  bf1 = step;
1147
23.0M
  bf1[0] = bf0[0];
1148
23.0M
  bf1[1] = bf0[1];
1149
23.0M
  bf1[2] = bf0[2];
1150
23.0M
  bf1[3] = bf0[3];
1151
23.0M
  bf1[4] = bf0[4];
1152
23.0M
  bf1[5] = bf0[5];
1153
23.0M
  bf1[6] = bf0[6];
1154
23.0M
  bf1[7] = bf0[7];
1155
23.0M
  bf1[8] = bf0[8];
1156
23.0M
  bf1[9] = bf0[9];
1157
23.0M
  bf1[10] = bf0[10];
1158
23.0M
  bf1[11] = bf0[11];
1159
23.0M
  bf1[12] = bf0[12];
1160
23.0M
  bf1[13] = bf0[13];
1161
23.0M
  bf1[14] = bf0[14];
1162
23.0M
  bf1[15] = bf0[15];
1163
23.0M
  bf1[16] = bf0[16];
1164
23.0M
  bf1[17] = bf0[17];
1165
23.0M
  bf1[18] = bf0[18];
1166
23.0M
  bf1[19] = bf0[19];
1167
23.0M
  bf1[20] = bf0[20];
1168
23.0M
  bf1[21] = bf0[21];
1169
23.0M
  bf1[22] = bf0[22];
1170
23.0M
  bf1[23] = bf0[23];
1171
23.0M
  bf1[24] = bf0[24];
1172
23.0M
  bf1[25] = bf0[25];
1173
23.0M
  bf1[26] = bf0[26];
1174
23.0M
  bf1[27] = bf0[27];
1175
23.0M
  bf1[28] = bf0[28];
1176
23.0M
  bf1[29] = bf0[29];
1177
23.0M
  bf1[30] = bf0[30];
1178
23.0M
  bf1[31] = bf0[31];
1179
23.0M
  bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit);
1180
23.0M
  bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit);
1181
23.0M
  bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit);
1182
23.0M
  bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit);
1183
23.0M
  bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit);
1184
23.0M
  bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit);
1185
23.0M
  bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit);
1186
23.0M
  bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit);
1187
23.0M
  bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit);
1188
23.0M
  bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit);
1189
23.0M
  bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit);
1190
23.0M
  bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit);
1191
23.0M
  bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit);
1192
23.0M
  bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit);
1193
23.0M
  bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit);
1194
23.0M
  bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit);
1195
23.0M
  bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit);
1196
23.0M
  bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit);
1197
23.0M
  bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit);
1198
23.0M
  bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit);
1199
23.0M
  bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit);
1200
23.0M
  bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit);
1201
23.0M
  bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit);
1202
23.0M
  bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit);
1203
23.0M
  bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit);
1204
23.0M
  bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit);
1205
23.0M
  bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit);
1206
23.0M
  bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit);
1207
23.0M
  bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit);
1208
23.0M
  bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
1209
23.0M
  bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
1210
23.0M
  bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
1211
23.0M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1212
1213
  // stage 3
1214
23.0M
  stage++;
1215
23.0M
  bf0 = step;
1216
23.0M
  bf1 = output;
1217
23.0M
  bf1[0] = bf0[0];
1218
23.0M
  bf1[1] = bf0[1];
1219
23.0M
  bf1[2] = bf0[2];
1220
23.0M
  bf1[3] = bf0[3];
1221
23.0M
  bf1[4] = bf0[4];
1222
23.0M
  bf1[5] = bf0[5];
1223
23.0M
  bf1[6] = bf0[6];
1224
23.0M
  bf1[7] = bf0[7];
1225
23.0M
  bf1[8] = bf0[8];
1226
23.0M
  bf1[9] = bf0[9];
1227
23.0M
  bf1[10] = bf0[10];
1228
23.0M
  bf1[11] = bf0[11];
1229
23.0M
  bf1[12] = bf0[12];
1230
23.0M
  bf1[13] = bf0[13];
1231
23.0M
  bf1[14] = bf0[14];
1232
23.0M
  bf1[15] = bf0[15];
1233
23.0M
  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
1234
23.0M
  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
1235
23.0M
  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
1236
23.0M
  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
1237
23.0M
  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
1238
23.0M
  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
1239
23.0M
  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
1240
23.0M
  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
1241
23.0M
  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
1242
23.0M
  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
1243
23.0M
  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
1244
23.0M
  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
1245
23.0M
  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
1246
23.0M
  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
1247
23.0M
  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
1248
23.0M
  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
1249
23.0M
  bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]);
1250
23.0M
  bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]);
1251
23.0M
  bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]);
1252
23.0M
  bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]);
1253
23.0M
  bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]);
1254
23.0M
  bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]);
1255
23.0M
  bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]);
1256
23.0M
  bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]);
1257
23.0M
  bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]);
1258
23.0M
  bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]);
1259
23.0M
  bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]);
1260
23.0M
  bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]);
1261
23.0M
  bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]);
1262
23.0M
  bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]);
1263
23.0M
  bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]);
1264
23.0M
  bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]);
1265
23.0M
  bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]);
1266
23.0M
  bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]);
1267
23.0M
  bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]);
1268
23.0M
  bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]);
1269
23.0M
  bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]);
1270
23.0M
  bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]);
1271
23.0M
  bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]);
1272
23.0M
  bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]);
1273
23.0M
  bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]);
1274
23.0M
  bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]);
1275
23.0M
  bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]);
1276
23.0M
  bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]);
1277
23.0M
  bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]);
1278
23.0M
  bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
1279
23.0M
  bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
1280
23.0M
  bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
1281
23.0M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1282
1283
  // stage 4
1284
23.0M
  stage++;
1285
23.0M
  bf0 = output;
1286
23.0M
  bf1 = step;
1287
23.0M
  bf1[0] = bf0[0];
1288
23.0M
  bf1[1] = bf0[1];
1289
23.0M
  bf1[2] = bf0[2];
1290
23.0M
  bf1[3] = bf0[3];
1291
23.0M
  bf1[4] = bf0[4];
1292
23.0M
  bf1[5] = bf0[5];
1293
23.0M
  bf1[6] = bf0[6];
1294
23.0M
  bf1[7] = bf0[7];
1295
23.0M
  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
1296
23.0M
  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
1297
23.0M
  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
1298
23.0M
  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
1299
23.0M
  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
1300
23.0M
  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
1301
23.0M
  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
1302
23.0M
  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
1303
23.0M
  bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
1304
23.0M
  bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
1305
23.0M
  bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
1306
23.0M
  bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
1307
23.0M
  bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
1308
23.0M
  bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
1309
23.0M
  bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
1310
23.0M
  bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
1311
23.0M
  bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
1312
23.0M
  bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
1313
23.0M
  bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
1314
23.0M
  bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
1315
23.0M
  bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
1316
23.0M
  bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
1317
23.0M
  bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
1318
23.0M
  bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
1319
23.0M
  bf1[32] = bf0[32];
1320
23.0M
  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
1321
23.0M
  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
1322
23.0M
  bf1[35] = bf0[35];
1323
23.0M
  bf1[36] = bf0[36];
1324
23.0M
  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
1325
23.0M
  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
1326
23.0M
  bf1[39] = bf0[39];
1327
23.0M
  bf1[40] = bf0[40];
1328
23.0M
  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
1329
23.0M
  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
1330
23.0M
  bf1[43] = bf0[43];
1331
23.0M
  bf1[44] = bf0[44];
1332
23.0M
  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
1333
23.0M
  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
1334
23.0M
  bf1[47] = bf0[47];
1335
23.0M
  bf1[48] = bf0[48];
1336
23.0M
  bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit);
1337
23.0M
  bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit);
1338
23.0M
  bf1[51] = bf0[51];
1339
23.0M
  bf1[52] = bf0[52];
1340
23.0M
  bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit);
1341
23.0M
  bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit);
1342
23.0M
  bf1[55] = bf0[55];
1343
23.0M
  bf1[56] = bf0[56];
1344
23.0M
  bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit);
1345
23.0M
  bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit);
1346
23.0M
  bf1[59] = bf0[59];
1347
23.0M
  bf1[60] = bf0[60];
1348
23.0M
  bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
1349
23.0M
  bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
1350
23.0M
  bf1[63] = bf0[63];
1351
23.0M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1352
1353
  // stage 5
1354
23.0M
  stage++;
1355
23.0M
  bf0 = step;
1356
23.0M
  bf1 = output;
1357
23.0M
  bf1[0] = bf0[0];
1358
23.0M
  bf1[1] = bf0[1];
1359
23.0M
  bf1[2] = bf0[2];
1360
23.0M
  bf1[3] = bf0[3];
1361
23.0M
  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
1362
23.0M
  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
1363
23.0M
  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
1364
23.0M
  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
1365
23.0M
  bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
1366
23.0M
  bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
1367
23.0M
  bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
1368
23.0M
  bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
1369
23.0M
  bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
1370
23.0M
  bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
1371
23.0M
  bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
1372
23.0M
  bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
1373
23.0M
  bf1[16] = bf0[16];
1374
23.0M
  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
1375
23.0M
  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
1376
23.0M
  bf1[19] = bf0[19];
1377
23.0M
  bf1[20] = bf0[20];
1378
23.0M
  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
1379
23.0M
  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
1380
23.0M
  bf1[23] = bf0[23];
1381
23.0M
  bf1[24] = bf0[24];
1382
23.0M
  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
1383
23.0M
  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
1384
23.0M
  bf1[27] = bf0[27];
1385
23.0M
  bf1[28] = bf0[28];
1386
23.0M
  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
1387
23.0M
  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
1388
23.0M
  bf1[31] = bf0[31];
1389
23.0M
  bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]);
1390
23.0M
  bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]);
1391
23.0M
  bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]);
1392
23.0M
  bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]);
1393
23.0M
  bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]);
1394
23.0M
  bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]);
1395
23.0M
  bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]);
1396
23.0M
  bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]);
1397
23.0M
  bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]);
1398
23.0M
  bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]);
1399
23.0M
  bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]);
1400
23.0M
  bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]);
1401
23.0M
  bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]);
1402
23.0M
  bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]);
1403
23.0M
  bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]);
1404
23.0M
  bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]);
1405
23.0M
  bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]);
1406
23.0M
  bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]);
1407
23.0M
  bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]);
1408
23.0M
  bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]);
1409
23.0M
  bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]);
1410
23.0M
  bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]);
1411
23.0M
  bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]);
1412
23.0M
  bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]);
1413
23.0M
  bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]);
1414
23.0M
  bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]);
1415
23.0M
  bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]);
1416
23.0M
  bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]);
1417
23.0M
  bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]);
1418
23.0M
  bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
1419
23.0M
  bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
1420
23.0M
  bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
1421
23.0M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1422
1423
  // stage 6
1424
23.0M
  stage++;
1425
23.0M
  bf0 = output;
1426
23.0M
  bf1 = step;
1427
23.0M
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
1428
23.0M
  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
1429
23.0M
  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
1430
23.0M
  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
1431
23.0M
  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
1432
23.0M
  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
1433
23.0M
  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
1434
23.0M
  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
1435
23.0M
  bf1[8] = bf0[8];
1436
23.0M
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
1437
23.0M
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
1438
23.0M
  bf1[11] = bf0[11];
1439
23.0M
  bf1[12] = bf0[12];
1440
23.0M
  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
1441
23.0M
  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
1442
23.0M
  bf1[15] = bf0[15];
1443
23.0M
  bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
1444
23.0M
  bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
1445
23.0M
  bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
1446
23.0M
  bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
1447
23.0M
  bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
1448
23.0M
  bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
1449
23.0M
  bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
1450
23.0M
  bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
1451
23.0M
  bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
1452
23.0M
  bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
1453
23.0M
  bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
1454
23.0M
  bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
1455
23.0M
  bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
1456
23.0M
  bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
1457
23.0M
  bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
1458
23.0M
  bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
1459
23.0M
  bf1[32] = bf0[32];
1460
23.0M
  bf1[33] = bf0[33];
1461
23.0M
  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
1462
23.0M
  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
1463
23.0M
  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
1464
23.0M
  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
1465
23.0M
  bf1[38] = bf0[38];
1466
23.0M
  bf1[39] = bf0[39];
1467
23.0M
  bf1[40] = bf0[40];
1468
23.0M
  bf1[41] = bf0[41];
1469
23.0M
  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
1470
23.0M
  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
1471
23.0M
  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
1472
23.0M
  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
1473
23.0M
  bf1[46] = bf0[46];
1474
23.0M
  bf1[47] = bf0[47];
1475
23.0M
  bf1[48] = bf0[48];
1476
23.0M
  bf1[49] = bf0[49];
1477
23.0M
  bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit);
1478
23.0M
  bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit);
1479
23.0M
  bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit);
1480
23.0M
  bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit);
1481
23.0M
  bf1[54] = bf0[54];
1482
23.0M
  bf1[55] = bf0[55];
1483
23.0M
  bf1[56] = bf0[56];
1484
23.0M
  bf1[57] = bf0[57];
1485
23.0M
  bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit);
1486
23.0M
  bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit);
1487
23.0M
  bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit);
1488
23.0M
  bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
1489
23.0M
  bf1[62] = bf0[62];
1490
23.0M
  bf1[63] = bf0[63];
1491
23.0M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1492
1493
  // stage 7
1494
23.0M
  stage++;
1495
23.0M
  bf0 = step;
1496
23.0M
  bf1 = output;
1497
23.0M
  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
1498
23.0M
  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
1499
23.0M
  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
1500
23.0M
  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
1501
23.0M
  bf1[4] = bf0[4];
1502
23.0M
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1503
23.0M
  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1504
23.0M
  bf1[7] = bf0[7];
1505
23.0M
  bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
1506
23.0M
  bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
1507
23.0M
  bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
1508
23.0M
  bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
1509
23.0M
  bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
1510
23.0M
  bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
1511
23.0M
  bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
1512
23.0M
  bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
1513
23.0M
  bf1[16] = bf0[16];
1514
23.0M
  bf1[17] = bf0[17];
1515
23.0M
  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
1516
23.0M
  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
1517
23.0M
  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
1518
23.0M
  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
1519
23.0M
  bf1[22] = bf0[22];
1520
23.0M
  bf1[23] = bf0[23];
1521
23.0M
  bf1[24] = bf0[24];
1522
23.0M
  bf1[25] = bf0[25];
1523
23.0M
  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
1524
23.0M
  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
1525
23.0M
  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
1526
23.0M
  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
1527
23.0M
  bf1[30] = bf0[30];
1528
23.0M
  bf1[31] = bf0[31];
1529
23.0M
  bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]);
1530
23.0M
  bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]);
1531
23.0M
  bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]);
1532
23.0M
  bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]);
1533
23.0M
  bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]);
1534
23.0M
  bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]);
1535
23.0M
  bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]);
1536
23.0M
  bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]);
1537
23.0M
  bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]);
1538
23.0M
  bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]);
1539
23.0M
  bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]);
1540
23.0M
  bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]);
1541
23.0M
  bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]);
1542
23.0M
  bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]);
1543
23.0M
  bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]);
1544
23.0M
  bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]);
1545
23.0M
  bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]);
1546
23.0M
  bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]);
1547
23.0M
  bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]);
1548
23.0M
  bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]);
1549
23.0M
  bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]);
1550
23.0M
  bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]);
1551
23.0M
  bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]);
1552
23.0M
  bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]);
1553
23.0M
  bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]);
1554
23.0M
  bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]);
1555
23.0M
  bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]);
1556
23.0M
  bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]);
1557
23.0M
  bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]);
1558
23.0M
  bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
1559
23.0M
  bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
1560
23.0M
  bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
1561
23.0M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1562
1563
  // stage 8
1564
23.0M
  stage++;
1565
23.0M
  bf0 = output;
1566
23.0M
  bf1 = step;
1567
23.0M
  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
1568
23.0M
  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
1569
23.0M
  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
1570
23.0M
  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
1571
23.0M
  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
1572
23.0M
  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
1573
23.0M
  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
1574
23.0M
  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
1575
23.0M
  bf1[8] = bf0[8];
1576
23.0M
  bf1[9] = bf0[9];
1577
23.0M
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1578
23.0M
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1579
23.0M
  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1580
23.0M
  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1581
23.0M
  bf1[14] = bf0[14];
1582
23.0M
  bf1[15] = bf0[15];
1583
23.0M
  bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
1584
23.0M
  bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
1585
23.0M
  bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
1586
23.0M
  bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
1587
23.0M
  bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
1588
23.0M
  bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
1589
23.0M
  bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
1590
23.0M
  bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
1591
23.0M
  bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
1592
23.0M
  bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
1593
23.0M
  bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
1594
23.0M
  bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
1595
23.0M
  bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
1596
23.0M
  bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
1597
23.0M
  bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
1598
23.0M
  bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
1599
23.0M
  bf1[32] = bf0[32];
1600
23.0M
  bf1[33] = bf0[33];
1601
23.0M
  bf1[34] = bf0[34];
1602
23.0M
  bf1[35] = bf0[35];
1603
23.0M
  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
1604
23.0M
  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
1605
23.0M
  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
1606
23.0M
  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
1607
23.0M
  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
1608
23.0M
  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
1609
23.0M
  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
1610
23.0M
  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
1611
23.0M
  bf1[44] = bf0[44];
1612
23.0M
  bf1[45] = bf0[45];
1613
23.0M
  bf1[46] = bf0[46];
1614
23.0M
  bf1[47] = bf0[47];
1615
23.0M
  bf1[48] = bf0[48];
1616
23.0M
  bf1[49] = bf0[49];
1617
23.0M
  bf1[50] = bf0[50];
1618
23.0M
  bf1[51] = bf0[51];
1619
23.0M
  bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit);
1620
23.0M
  bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit);
1621
23.0M
  bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit);
1622
23.0M
  bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit);
1623
23.0M
  bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit);
1624
23.0M
  bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit);
1625
23.0M
  bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit);
1626
23.0M
  bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit);
1627
23.0M
  bf1[60] = bf0[60];
1628
23.0M
  bf1[61] = bf0[61];
1629
23.0M
  bf1[62] = bf0[62];
1630
23.0M
  bf1[63] = bf0[63];
1631
23.0M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1632
1633
  // stage 9
1634
23.0M
  stage++;
1635
23.0M
  bf0 = step;
1636
23.0M
  bf1 = output;
1637
23.0M
  bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
1638
23.0M
  bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
1639
23.0M
  bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
1640
23.0M
  bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
1641
23.0M
  bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
1642
23.0M
  bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
1643
23.0M
  bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
1644
23.0M
  bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
1645
23.0M
  bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
1646
23.0M
  bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
1647
23.0M
  bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
1648
23.0M
  bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
1649
23.0M
  bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
1650
23.0M
  bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
1651
23.0M
  bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
1652
23.0M
  bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
1653
23.0M
  bf1[16] = bf0[16];
1654
23.0M
  bf1[17] = bf0[17];
1655
23.0M
  bf1[18] = bf0[18];
1656
23.0M
  bf1[19] = bf0[19];
1657
23.0M
  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1658
23.0M
  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1659
23.0M
  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1660
23.0M
  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1661
23.0M
  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1662
23.0M
  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1663
23.0M
  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1664
23.0M
  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1665
23.0M
  bf1[28] = bf0[28];
1666
23.0M
  bf1[29] = bf0[29];
1667
23.0M
  bf1[30] = bf0[30];
1668
23.0M
  bf1[31] = bf0[31];
1669
23.0M
  bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]);
1670
23.0M
  bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]);
1671
23.0M
  bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]);
1672
23.0M
  bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]);
1673
23.0M
  bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]);
1674
23.0M
  bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]);
1675
23.0M
  bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]);
1676
23.0M
  bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]);
1677
23.0M
  bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]);
1678
23.0M
  bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]);
1679
23.0M
  bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]);
1680
23.0M
  bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]);
1681
23.0M
  bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]);
1682
23.0M
  bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]);
1683
23.0M
  bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]);
1684
23.0M
  bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]);
1685
23.0M
  bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]);
1686
23.0M
  bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]);
1687
23.0M
  bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]);
1688
23.0M
  bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]);
1689
23.0M
  bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]);
1690
23.0M
  bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]);
1691
23.0M
  bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]);
1692
23.0M
  bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]);
1693
23.0M
  bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]);
1694
23.0M
  bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]);
1695
23.0M
  bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]);
1696
23.0M
  bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]);
1697
23.0M
  bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]);
1698
23.0M
  bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
1699
23.0M
  bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
1700
23.0M
  bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
1701
23.0M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1702
1703
  // stage 10
1704
23.0M
  stage++;
1705
23.0M
  bf0 = output;
1706
23.0M
  bf1 = step;
1707
23.0M
  bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
1708
23.0M
  bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
1709
23.0M
  bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
1710
23.0M
  bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
1711
23.0M
  bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
1712
23.0M
  bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
1713
23.0M
  bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
1714
23.0M
  bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
1715
23.0M
  bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
1716
23.0M
  bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
1717
23.0M
  bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
1718
23.0M
  bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
1719
23.0M
  bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
1720
23.0M
  bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
1721
23.0M
  bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
1722
23.0M
  bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
1723
23.0M
  bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
1724
23.0M
  bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
1725
23.0M
  bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
1726
23.0M
  bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
1727
23.0M
  bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
1728
23.0M
  bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
1729
23.0M
  bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
1730
23.0M
  bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
1731
23.0M
  bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
1732
23.0M
  bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
1733
23.0M
  bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
1734
23.0M
  bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
1735
23.0M
  bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
1736
23.0M
  bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
1737
23.0M
  bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
1738
23.0M
  bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
1739
23.0M
  bf1[32] = bf0[32];
1740
23.0M
  bf1[33] = bf0[33];
1741
23.0M
  bf1[34] = bf0[34];
1742
23.0M
  bf1[35] = bf0[35];
1743
23.0M
  bf1[36] = bf0[36];
1744
23.0M
  bf1[37] = bf0[37];
1745
23.0M
  bf1[38] = bf0[38];
1746
23.0M
  bf1[39] = bf0[39];
1747
23.0M
  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1748
23.0M
  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1749
23.0M
  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1750
23.0M
  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1751
23.0M
  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1752
23.0M
  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1753
23.0M
  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1754
23.0M
  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1755
23.0M
  bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1756
23.0M
  bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1757
23.0M
  bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1758
23.0M
  bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1759
23.0M
  bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1760
23.0M
  bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1761
23.0M
  bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1762
23.0M
  bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1763
23.0M
  bf1[56] = bf0[56];
1764
23.0M
  bf1[57] = bf0[57];
1765
23.0M
  bf1[58] = bf0[58];
1766
23.0M
  bf1[59] = bf0[59];
1767
23.0M
  bf1[60] = bf0[60];
1768
23.0M
  bf1[61] = bf0[61];
1769
23.0M
  bf1[62] = bf0[62];
1770
23.0M
  bf1[63] = bf0[63];
1771
23.0M
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1772
1773
  // stage 11
1774
23.0M
  stage++;
1775
23.0M
  bf0 = step;
1776
23.0M
  bf1 = output;
1777
23.0M
  bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]);
1778
23.0M
  bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]);
1779
23.0M
  bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]);
1780
23.0M
  bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]);
1781
23.0M
  bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]);
1782
23.0M
  bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]);
1783
23.0M
  bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]);
1784
23.0M
  bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]);
1785
23.0M
  bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]);
1786
23.0M
  bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]);
1787
23.0M
  bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]);
1788
23.0M
  bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]);
1789
23.0M
  bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]);
1790
23.0M
  bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]);
1791
23.0M
  bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]);
1792
23.0M
  bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]);
1793
23.0M
  bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]);
1794
23.0M
  bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]);
1795
23.0M
  bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]);
1796
23.0M
  bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]);
1797
23.0M
  bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]);
1798
23.0M
  bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]);
1799
23.0M
  bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]);
1800
23.0M
  bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]);
1801
23.0M
  bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]);
1802
23.0M
  bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]);
1803
23.0M
  bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]);
1804
23.0M
  bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]);
1805
23.0M
  bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]);
1806
23.0M
  bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]);
1807
23.0M
  bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]);
1808
23.0M
  bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]);
1809
23.0M
  bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]);
1810
23.0M
  bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]);
1811
23.0M
  bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]);
1812
23.0M
  bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]);
1813
23.0M
  bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]);
1814
23.0M
  bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]);
1815
23.0M
  bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]);
1816
23.0M
  bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]);
1817
23.0M
  bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]);
1818
23.0M
  bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]);
1819
23.0M
  bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]);
1820
23.0M
  bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]);
1821
23.0M
  bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]);
1822
23.0M
  bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]);
1823
23.0M
  bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]);
1824
23.0M
  bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]);
1825
23.0M
  bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]);
1826
23.0M
  bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]);
1827
23.0M
  bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]);
1828
23.0M
  bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]);
1829
23.0M
  bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]);
1830
23.0M
  bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]);
1831
23.0M
  bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]);
1832
23.0M
  bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]);
1833
23.0M
  bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]);
1834
23.0M
  bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]);
1835
23.0M
  bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]);
1836
23.0M
  bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]);
1837
23.0M
  bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]);
1838
23.0M
  bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]);
1839
23.0M
  bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]);
1840
23.0M
  bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]);
1841
23.0M
}