Coverage Report

Created: 2024-06-18 06:48

/src/aom/av1/common/x86/highbd_inv_txfm_avx2.c
 Line| Count|Source
    1|      |/*
    2|      | * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
    3|      | *
    4|      | * This source code is subject to the terms of the BSD 2 Clause License and
    5|      | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
    6|      | * was not distributed with this source code in the LICENSE file, you can
    7|      | * obtain it at www.aomedia.org/license/software. If the Alliance for Open
    8|      | * Media Patent License 1.0 was not distributed with this source code in the
    9|      | * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
   10|      | */
   11|      |#include <assert.h>
   12|      |#include <immintrin.h>
   13|      |
   14|      |#include "config/aom_config.h"
   15|      |#include "config/av1_rtcd.h"
   16|      |
   17|      |#include "av1/common/av1_inv_txfm1d_cfg.h"
   18|      |#include "av1/common/idct.h"
   19|      |#include "av1/common/x86/av1_inv_txfm_ssse3.h"
   20|      |#include "av1/common/x86/highbd_txfm_utility_sse4.h"
   21|      |#include "aom_dsp/x86/txfm_common_avx2.h"
   22|      |
   23|      |// Note:
   24|      |//  A 32x32 block of coefficients takes 32x4 = 128 registers in total.
   25|      |//  For high bit depth, each coefficient is 4 bytes.
   26|      |//  Each __m256i register holds 8 coefficients.
   27|      |//  So each row needs 4 registers, and there are 32 rows.
   28|      |//  Register layout:
   29|      |//   v0,   v1,   v2,   v3,
   30|      |//   v4,   v5,   v6,   v7,
   31|      |//   ... ...
   32|      |//   v124, v125, v126, v127
   33|      |
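For reference, the layout above maps coefficient (row, col) of the 32x32 block
to a register and lane as sketched below (hypothetical helpers, not part of
this file):

  // Register index and lane for coefficient (row, col), 0 <= row, col < 32.
  static inline int layout_reg(int row, int col) { return row * 4 + col / 8; }
  static inline int layout_lane(int col) { return col % 8; }
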
   34|  119M|static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
   35|  119M|  const __m256i zero = _mm256_setzero_si256();
   36|  119M|  const __m256i one = _mm256_set1_epi16(1);
   37|  119M|  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
   38|  119M|  __m256i clamped, mask;
   39|      |
   40|  119M|  mask = _mm256_cmpgt_epi16(u, max);
   41|  119M|  clamped = _mm256_andnot_si256(mask, u);
   42|  119M|  mask = _mm256_and_si256(mask, max);
   43|  119M|  clamped = _mm256_or_si256(mask, clamped);
   44|  119M|  mask = _mm256_cmpgt_epi16(clamped, zero);
   45|  119M|  clamped = _mm256_and_si256(clamped, mask);
   46|      |
   47|  119M|  return clamped;
   48|  119M|}
   49|      |
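Per 16-bit lane, the mask dance above is just a clamp to the valid bd-bit
pixel range; a scalar sketch (assuming bd <= 12, so the maximum fits in
int16_t):

  static inline int16_t clamp_pixel(int16_t v, int bd) {
    const int16_t max = (int16_t)((1 << bd) - 1);
    return v < 0 ? 0 : (v > max ? max : v);
  }
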
   50| 17.5M|static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) {
   51| 17.5M|  if (shift != 0) {
   52| 17.5M|    __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
   53| 17.5M|    in[0] = _mm256_add_epi32(in[0], rnding);
   54| 17.5M|    in[1] = _mm256_add_epi32(in[1], rnding);
   55| 17.5M|    in[2] = _mm256_add_epi32(in[2], rnding);
   56| 17.5M|    in[3] = _mm256_add_epi32(in[3], rnding);
   57|      |
   58| 17.5M|    in[0] = _mm256_srai_epi32(in[0], shift);
   59| 17.5M|    in[1] = _mm256_srai_epi32(in[1], shift);
   60| 17.5M|    in[2] = _mm256_srai_epi32(in[2], shift);
   61| 17.5M|    in[3] = _mm256_srai_epi32(in[3], shift);
   62| 17.5M|  }
   63| 17.5M|}
   64|      |
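Per 32-bit lane this is the usual round-to-nearest right shift (ties rounded
up); a scalar model, assuming shift > 0 as guarded above:

  static inline int32_t round_shift(int32_t x, int shift) {
    return (x + (1 << (shift - 1))) >> shift;
  }
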
   65| 4.05M|static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) {
   66| 4.05M|  round_shift_4x4_avx2(in, shift);
   67| 4.05M|  round_shift_4x4_avx2(in + 4, shift);
   68| 4.05M|  round_shift_4x4_avx2(in + 8, shift);
   69| 4.05M|  round_shift_4x4_avx2(in + 12, shift);
   70| 4.05M|}
   71|      |
   72|      |static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out,
   73|      |                                    const __m256i *clamp_lo,
   74| 2.66M|                                    const __m256i *clamp_hi, int size) {
   75| 2.66M|  __m256i a0, a1;
   76| 20.2M|  for (int i = 0; i < size; i += 4) {
   77| 17.5M|    a0 = _mm256_max_epi32(in[i], *clamp_lo);
   78| 17.5M|    out[i] = _mm256_min_epi32(a0, *clamp_hi);
   79|      |
   80| 17.5M|    a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
   81| 17.5M|    out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
   82|      |
   83| 17.5M|    a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
   84| 17.5M|    out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
   85|      |
   86| 17.5M|    a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
   87| 17.5M|    out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
   88| 17.5M|  }
   89| 2.66M|}
   90|      |
   91|      |static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
   92|      |                                                 __m256i res0, __m256i res1,
   93|  101M|                                                 const int bd) {
   94|  101M|  __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
   95|  101M|  __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
   96|      |
   97|  101M|  x0 = _mm256_add_epi32(res0, x0);
   98|  101M|  x1 = _mm256_add_epi32(res1, x1);
   99|  101M|  x0 = _mm256_packus_epi32(x0, x1);
  100|  101M|  x0 = _mm256_permute4x64_epi64(x0, 0xd8);
  101|  101M|  x0 = highbd_clamp_epi16_avx2(x0, bd);
  102|  101M|  return x0;
  103|  101M|}
  104|      |
  105|      |static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
  106|      |                                                 int stride, int flipud,
  107| 5.00M|                                                 int height, const int bd) {
  108| 5.00M|  int j = flipud ? (height - 1) : 0;
  109| 5.00M|  const int step = flipud ? -1 : 1;
  110|  107M|  for (int i = 0; i < height; ++i, j += step) {
  111|  102M|    __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
  112|  102M|    __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
  113|      |
  114|  102M|    _mm256_storeu_si256((__m256i *)(output + i * stride), u);
  115|  102M|  }
  116| 5.00M|}
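Per pixel, the reconstruction path above adds the inverse-transform residual
to the prediction and clamps to the bd-bit range; a scalar sketch of one lane
(hypothetical helper, not part of this file):

  static inline uint16_t reconstruct(uint16_t pred, int32_t residual, int bd) {
    const int32_t v = (int32_t)pred + residual;
    const int32_t max = (1 << bd) - 1;
    return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
  }
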
  117|      |static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
  118| 17.9M|                                                const int bd) {
  119| 17.9M|  __m256i x0 = pred;
  120| 17.9M|  x0 = _mm256_add_epi32(res, x0);
  121| 17.9M|  x0 = _mm256_packus_epi32(x0, x0);
  122| 17.9M|  x0 = _mm256_permute4x64_epi64(x0, 0xd8);
  123| 17.9M|  x0 = highbd_clamp_epi16_avx2(x0, bd);
  124| 17.9M|  return x0;
  125| 17.9M|}
  126|      |
  127|      |static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output,
  128|      |                                                int stride, int flipud,
  129| 1.68M|                                                int height, const int bd) {
  130| 1.68M|  int j = flipud ? (height - 1) : 0;
  131| 1.68M|  __m128i temp;
  132| 1.68M|  const int step = flipud ? -1 : 1;
  133| 19.6M|  for (int i = 0; i < height; ++i, j += step) {
  134| 17.9M|    temp = _mm_loadu_si128((__m128i const *)(output + i * stride));
  135| 17.9M|    __m256i v = _mm256_cvtepi16_epi32(temp);
  136| 17.9M|    __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
  137| 17.9M|    __m128i u1 = _mm256_castsi256_si128(u);
  138| 17.9M|    _mm_storeu_si128((__m128i *)(output + i * stride), u1);
  139| 17.9M|  }
  140| 1.68M|}
  141|      |static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
  142|      |                           __m256i *out1, const __m256i *clamp_lo,
  143| 8.86M|                           const __m256i *clamp_hi, int shift) {
  144| 8.86M|  __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
  145| 8.86M|  __m256i a0 = _mm256_add_epi32(offset, in0);
  146| 8.86M|  __m256i a1 = _mm256_sub_epi32(offset, in1);
  147|      |
  148| 8.86M|  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  149| 8.86M|  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
  150|      |
  151| 8.86M|  a0 = _mm256_max_epi32(a0, *clamp_lo);
  152| 8.86M|  a0 = _mm256_min_epi32(a0, *clamp_hi);
  153| 8.86M|  a1 = _mm256_max_epi32(a1, *clamp_lo);
  154| 8.86M|  a1 = _mm256_min_epi32(a1, *clamp_hi);
  155|      |
  156| 8.86M|  *out0 = a0;
  157| 8.86M|  *out1 = a1;
  158| 8.86M|}
  159|      |
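neg_shift_avx2 is the round-shift used on flipped outputs: per lane it
produces round_shift(in0) and round_shift(-in1), then clamps both. A scalar
model (hypothetical helper, not part of this file):

  static inline void neg_shift(int32_t in0, int32_t in1, int32_t *out0,
                               int32_t *out1, int32_t lo, int32_t hi,
                               int shift) {
    const int32_t offset = (1 << shift) >> 1;
    const int32_t a0 = (offset + in0) >> shift;
    const int32_t a1 = (offset - in1) >> shift;
    *out0 = a0 < lo ? lo : (a0 > hi ? hi : a0);
    *out1 = a1 < lo ? lo : (a1 > hi ? hi : a1);
  }
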
  160| 13.9M|static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
  161| 13.9M|  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  162| 13.9M|  __m256i x0, x1;
  163|      |
  164| 13.9M|  u0 = _mm256_unpacklo_epi32(in[0], in[1]);
  165| 13.9M|  u1 = _mm256_unpackhi_epi32(in[0], in[1]);
  166|      |
  167| 13.9M|  u2 = _mm256_unpacklo_epi32(in[2], in[3]);
  168| 13.9M|  u3 = _mm256_unpackhi_epi32(in[2], in[3]);
  169|      |
  170| 13.9M|  u4 = _mm256_unpacklo_epi32(in[4], in[5]);
  171| 13.9M|  u5 = _mm256_unpackhi_epi32(in[4], in[5]);
  172|      |
  173| 13.9M|  u6 = _mm256_unpacklo_epi32(in[6], in[7]);
  174| 13.9M|  u7 = _mm256_unpackhi_epi32(in[6], in[7]);
  175|      |
  176| 13.9M|  x0 = _mm256_unpacklo_epi64(u0, u2);
  177| 13.9M|  x1 = _mm256_unpacklo_epi64(u4, u6);
  178| 13.9M|  out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
  179| 13.9M|  out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
  180|      |
  181| 13.9M|  x0 = _mm256_unpackhi_epi64(u0, u2);
  182| 13.9M|  x1 = _mm256_unpackhi_epi64(u4, u6);
  183| 13.9M|  out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
  184| 13.9M|  out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
  185|      |
  186| 13.9M|  x0 = _mm256_unpacklo_epi64(u1, u3);
  187| 13.9M|  x1 = _mm256_unpacklo_epi64(u5, u7);
  188| 13.9M|  out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
  189| 13.9M|  out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
  190|      |
  191| 13.9M|  x0 = _mm256_unpackhi_epi64(u1, u3);
  192| 13.9M|  x1 = _mm256_unpackhi_epi64(u5, u7);
  193| 13.9M|  out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
  194| 13.9M|  out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
  195| 13.9M|}
  196|      |
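The unpack/permute sequence above implements a plain 8x8 transpose of 32-bit
elements; the reference it must match is simply:

  static void transpose_8x8_ref(const int32_t in[8][8], int32_t out[8][8]) {
    for (int r = 0; r < 8; ++r)
      for (int c = 0; c < 8; ++c) out[c][r] = in[r][c];
  }

transpose_8x8_flip_avx2 below computes the same thing with the input rows
taken in reverse order (in[7] first), which serves the flipped transform
variants.
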
  197|  178k|static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
  198|  178k|  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  199|  178k|  __m256i x0, x1;
  200|      |
  201|  178k|  u0 = _mm256_unpacklo_epi32(in[7], in[6]);
  202|  178k|  u1 = _mm256_unpackhi_epi32(in[7], in[6]);
  203|      |
  204|  178k|  u2 = _mm256_unpacklo_epi32(in[5], in[4]);
  205|  178k|  u3 = _mm256_unpackhi_epi32(in[5], in[4]);
  206|      |
  207|  178k|  u4 = _mm256_unpacklo_epi32(in[3], in[2]);
  208|  178k|  u5 = _mm256_unpackhi_epi32(in[3], in[2]);
  209|      |
  210|  178k|  u6 = _mm256_unpacklo_epi32(in[1], in[0]);
  211|  178k|  u7 = _mm256_unpackhi_epi32(in[1], in[0]);
  212|      |
  213|  178k|  x0 = _mm256_unpacklo_epi64(u0, u2);
  214|  178k|  x1 = _mm256_unpacklo_epi64(u4, u6);
  215|  178k|  out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
  216|  178k|  out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
  217|      |
  218|  178k|  x0 = _mm256_unpackhi_epi64(u0, u2);
  219|  178k|  x1 = _mm256_unpackhi_epi64(u4, u6);
  220|  178k|  out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
  221|  178k|  out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
  222|      |
  223|  178k|  x0 = _mm256_unpacklo_epi64(u1, u3);
  224|  178k|  x1 = _mm256_unpacklo_epi64(u5, u7);
  225|  178k|  out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
  226|  178k|  out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
  227|      |
  228|  178k|  x0 = _mm256_unpackhi_epi64(u1, u3);
  229|  178k|  x1 = _mm256_unpackhi_epi64(u5, u7);
  230|  178k|  out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
  231|  178k|  out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
  232|  178k|}
  233|      |
  234|      |static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
  235| 5.29M|                                           __m256i *out, int out_size) {
  236| 61.2M|  for (int i = 0; i < out_size; ++i) {
  237| 55.9M|    out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
  238| 55.9M|  }
  239| 5.29M|}
  240|      |
  241|      |static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
  242| 97.6M|                                      const __m256i *rounding, int bit) {
  243| 97.6M|  __m256i x;
  244| 97.6M|  x = _mm256_mullo_epi32(*w0, *n0);
  245| 97.6M|  x = _mm256_add_epi32(x, *rounding);
  246| 97.6M|  x = _mm256_srai_epi32(x, bit);
  247| 97.6M|  return x;
  248| 97.6M|}
  249|      |
  250|      |static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
  251|      |                                    const __m256i *w1, const __m256i *n1,
  252|  206M|                                    const __m256i *rounding, int bit) {
  253|  206M|  __m256i x, y;
  254|      |
  255|  206M|  x = _mm256_mullo_epi32(*w0, *n0);
  256|  206M|  y = _mm256_mullo_epi32(*w1, *n1);
  257|  206M|  x = _mm256_add_epi32(x, y);
  258|  206M|  x = _mm256_add_epi32(x, *rounding);
  259|  206M|  x = _mm256_srai_epi32(x, bit);
  260|  206M|  return x;
  261|  206M|}
  262|      |
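These two helpers are the fixed-point "half butterfly" at the heart of every
stage: per 32-bit lane, half_btf computes (w0*n0 + w1*n1 + round) >> bit, and
half_btf_0 is the w1 == 0 special case. A scalar sketch (64-bit intermediate
shown for clarity; the AVX2 code does the same arithmetic in 32-bit lanes):

  static inline int32_t half_btf(int32_t w0, int32_t n0, int32_t w1,
                                 int32_t n1, int bit) {
    const int64_t rounding = 1LL << (bit - 1);
    return (int32_t)(((int64_t)w0 * n0 + (int64_t)w1 * n1 + rounding) >> bit);
  }
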
  263|      |static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
  264|      |                        __m256i *out1, const __m256i *clamp_lo,
  265|  377M|                        const __m256i *clamp_hi) {
  266|  377M|  __m256i a0 = _mm256_add_epi32(in0, in1);
  267|  377M|  __m256i a1 = _mm256_sub_epi32(in0, in1);
  268|      |
  269|  377M|  a0 = _mm256_max_epi32(a0, *clamp_lo);
  270|  377M|  a0 = _mm256_min_epi32(a0, *clamp_hi);
  271|  377M|  a1 = _mm256_max_epi32(a1, *clamp_lo);
  272|  377M|  a1 = _mm256_min_epi32(a1, *clamp_hi);
  273|      |
  274|  377M|  *out0 = a0;
  275|  377M|  *out1 = a1;
  276|  377M|}
  277|      |
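addsub_avx2 is the other butterfly half: a lane-wise add/subtract pair
saturated to the current stage's dynamic range. Scalar model (hypothetical
helper, not part of this file):

  static inline void addsub(int32_t in0, int32_t in1, int32_t *out0,
                            int32_t *out1, int32_t lo, int32_t hi) {
    const int32_t a0 = in0 + in1;
    const int32_t a1 = in0 - in1;
    *out0 = a0 < lo ? lo : (a0 > hi ? hi : a0);
    *out1 = a1 < lo ? lo : (a1 > hi ? hi : a1);
  }
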
  278|      |static INLINE void idct32_stage4_avx2(
  279|      |    __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
  280|      |    const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
  281|      |    const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
  282| 1.96M|    const __m256i *rounding, int bit) {
  283| 1.96M|  __m256i temp1, temp2;
  284| 1.96M|  temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
  285| 1.96M|  bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
  286| 1.96M|  bf1[17] = temp1;
  287|      |
  288| 1.96M|  temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
  289| 1.96M|  bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
  290| 1.96M|  bf1[18] = temp2;
  291|      |
  292| 1.96M|  temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
  293| 1.96M|  bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
  294| 1.96M|  bf1[21] = temp1;
  295|      |
  296| 1.96M|  temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
  297| 1.96M|  bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
  298| 1.96M|  bf1[22] = temp2;
  299| 1.96M|}
  300|      |
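Each temp1/temp2 pair in the stage helpers is one fixed-point rotation of a
coefficient pair; for (bf1[17], bf1[30]) above the per-lane update is:

  // bf1[17]' = (-cospi8  * bf1[17] + cospi56 * bf1[30] + round) >> bit
  // bf1[30]' = ( cospi56 * bf1[17] + cospi8  * bf1[30] + round) >> bit
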
  301|      |static INLINE void idct32_stage5_avx2(
  302|      |    __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
  303|      |    const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
  304| 1.96M|    const __m256i *clamp_hi, const __m256i *rounding, int bit) {
  305| 1.96M|  __m256i temp1, temp2;
  306| 1.96M|  temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
  307| 1.96M|  bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
  308| 1.96M|  bf1[9] = temp1;
  309|      |
  310| 1.96M|  temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
  311| 1.96M|  bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
  312| 1.96M|  bf1[10] = temp2;
  313|      |
  314| 1.96M|  addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  315| 1.96M|  addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  316| 1.96M|  addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  317| 1.96M|  addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  318| 1.96M|  addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  319| 1.96M|  addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  320| 1.96M|  addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  321| 1.96M|  addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
  322| 1.96M|}
  323|      |
  324|      |static INLINE void idct32_stage6_avx2(
  325|      |    __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
  326|      |    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
  327|      |    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
  328| 1.96M|    const __m256i *rounding, int bit) {
  329| 1.96M|  __m256i temp1, temp2;
  330| 1.96M|  temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  331| 1.96M|  bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  332| 1.96M|  bf1[5] = temp1;
  333|      |
  334| 1.96M|  addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  335| 1.96M|  addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  336| 1.96M|  addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  337| 1.96M|  addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
  338|      |
  339| 1.96M|  temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
  340| 1.96M|  bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
  341| 1.96M|  bf1[18] = temp1;
  342| 1.96M|  temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
  343| 1.96M|  bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
  344| 1.96M|  bf1[19] = temp2;
  345| 1.96M|  temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
  346| 1.96M|  bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
  347| 1.96M|  bf1[20] = temp1;
  348| 1.96M|  temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
  349| 1.96M|  bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
  350| 1.96M|  bf1[21] = temp2;
  351| 1.96M|}
  352|      |
  353|      |static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
  354|      |                                      const __m256i *cospi32,
  355|      |                                      const __m256i *clamp_lo,
  356|      |                                      const __m256i *clamp_hi,
  357| 1.96M|                                      const __m256i *rounding, int bit) {
  358| 1.96M|  __m256i temp1, temp2;
  359| 1.96M|  addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  360| 1.96M|  addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  361| 1.96M|  addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  362| 1.96M|  addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
  363|      |
  364| 1.96M|  temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  365| 1.96M|  bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  366| 1.96M|  bf1[10] = temp1;
  367| 1.96M|  temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  368| 1.96M|  bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  369| 1.96M|  bf1[11] = temp2;
  370|      |
  371| 1.96M|  addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  372| 1.96M|  addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  373| 1.96M|  addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  374| 1.96M|  addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  375| 1.96M|  addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  376| 1.96M|  addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  377| 1.96M|  addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  378| 1.96M|  addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
  379| 1.96M|}
  380|      |
  381|      |static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
  382|      |                                      const __m256i *cospi32,
  383|      |                                      const __m256i *clamp_lo,
  384|      |                                      const __m256i *clamp_hi,
  385| 1.96M|                                      const __m256i *rounding, int bit) {
  386| 1.96M|  __m256i temp1, temp2;
  387| 1.96M|  addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  388| 1.96M|  addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  389| 1.96M|  addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  390| 1.96M|  addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  391| 1.96M|  addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  392| 1.96M|  addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  393| 1.96M|  addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  394| 1.96M|  addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
  395|      |
  396| 1.96M|  temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  397| 1.96M|  bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  398| 1.96M|  bf1[20] = temp1;
  399| 1.96M|  temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  400| 1.96M|  bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  401| 1.96M|  bf1[21] = temp2;
  402| 1.96M|  temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  403| 1.96M|  bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  404| 1.96M|  bf1[22] = temp1;
  405| 1.96M|  temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  406| 1.96M|  bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  407| 1.96M|  bf1[23] = temp2;
  408| 1.96M|}
  409|      |
  410|      |static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
  411|      |                                      const int do_cols, const int bd,
  412|      |                                      const int out_shift,
  413|      |                                      const __m256i *clamp_lo,
  414| 1.96M|                                      const __m256i *clamp_hi) {
  415| 1.96M|  addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
  416| 1.96M|  addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
  417| 1.96M|  addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
  418| 1.96M|  addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
  419| 1.96M|  addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
  420| 1.96M|  addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
  421| 1.96M|  addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
  422| 1.96M|  addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
  423| 1.96M|  addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
  424| 1.96M|  addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
  425| 1.96M|  addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
  426| 1.96M|  addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
  427| 1.96M|  addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
  428| 1.96M|  addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
  429| 1.96M|  addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
  430| 1.96M|  addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
  431| 1.96M|  if (!do_cols) {
  432|  768k|    const int log_range_out = AOMMAX(16, bd + 6);
  433|  768k|    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
  434|  768k|    const __m256i clamp_hi_out =
  435|  768k|        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
  436|  768k|    round_shift_8x8_avx2(out, out_shift);
  437|  768k|    round_shift_8x8_avx2(out + 16, out_shift);
  438|  768k|    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  439|  768k|  }
  440| 1.96M|}
  441|      |
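The row pass (!do_cols) narrows the intermediates before the column pass:
outputs are round-shifted by out_shift and clamped to a signed range of
AOMMAX(16, bd + 6) bits. A scalar sketch of how those bounds are formed
(hypothetical helper, mirroring the in-line computation above):

  static inline void stage9_out_bounds(int bd, int32_t *lo, int32_t *hi) {
    const int log_range_out = (bd + 6 > 16) ? bd + 6 : 16;  // AOMMAX(16, bd + 6)
    *lo = -(1 << (log_range_out - 1));
    *hi = (1 << (log_range_out - 1)) - 1;
  }
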
  442|      |static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
  443|  999k|                             int bd, int out_shift) {
  444|  999k|  const int32_t *cospi = cospi_arr(bit);
  445|  999k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  446|  999k|  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  447|  999k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  448|  999k|  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  449|  999k|  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  450|  999k|  __m256i x;
  451|      |  // stage 0
  452|      |  // stage 1
  453|      |  // stage 2
  454|      |  // stage 3
  455|      |  // stage 4
  456|      |  // stage 5
  457|  999k|  x = _mm256_mullo_epi32(in[0], cospi32);
  458|  999k|  x = _mm256_add_epi32(x, rounding);
  459|  999k|  x = _mm256_srai_epi32(x, bit);
  460|      |
  461|      |  // stage 6
  462|      |  // stage 7
  463|      |  // stage 8
  464|      |  // stage 9
  465|  999k|  if (!do_cols) {
  466|  353k|    const int log_range_out = AOMMAX(16, bd + 6);
  467|  353k|    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
  468|  353k|    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
  469|  353k|    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
  470|  353k|    x = _mm256_add_epi32(offset, x);
  471|  353k|    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
  472|  353k|  }
  473|  999k|  x = _mm256_max_epi32(x, clamp_lo);
  474|  999k|  x = _mm256_min_epi32(x, clamp_hi);
  475|  999k|  out[0] = x;
  476|  999k|  out[1] = x;
  477|  999k|  out[2] = x;
  478|  999k|  out[3] = x;
  479|  999k|  out[4] = x;
  480|  999k|  out[5] = x;
  481|  999k|  out[6] = x;
  482|  999k|  out[7] = x;
  483|  999k|  out[8] = x;
  484|  999k|  out[9] = x;
  485|  999k|  out[10] = x;
  486|  999k|  out[11] = x;
  487|  999k|  out[12] = x;
  488|  999k|  out[13] = x;
  489|  999k|  out[14] = x;
  490|  999k|  out[15] = x;
  491|  999k|  out[16] = x;
  492|  999k|  out[17] = x;
  493|  999k|  out[18] = x;
  494|  999k|  out[19] = x;
  495|  999k|  out[20] = x;
  496|  999k|  out[21] = x;
  497|  999k|  out[22] = x;
  498|  999k|  out[23] = x;
  499|  999k|  out[24] = x;
  500|  999k|  out[25] = x;
  501|  999k|  out[26] = x;
  502|  999k|  out[27] = x;
  503|  999k|  out[28] = x;
  504|  999k|  out[29] = x;
  505|  999k|  out[30] = x;
  506|  999k|  out[31] = x;
  507|  999k|}
  508|      |
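idct32_low1 serves the common DC-only case: with every AC coefficient zero,
all nine stages collapse to a single fixed-point scaling of in[0], and the
result is broadcast to all 32 outputs. Per lane (before the optional
out_shift rounding in the row pass):

  // out[i] = (in[0] * cospi[32] + (1 << (bit - 1))) >> bit, for 0 <= i < 32
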
  509|      |static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
  510| 1.50M|                             int bd, int out_shift) {
  511| 1.50M|  const int32_t *cospi = cospi_arr(bit);
  512| 1.50M|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  513| 1.50M|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  514| 1.50M|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  515| 1.50M|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  516| 1.50M|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  517| 1.50M|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  518| 1.50M|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  519| 1.50M|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  520| 1.50M|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  521| 1.50M|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  522| 1.50M|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  523| 1.50M|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  524| 1.50M|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  525| 1.50M|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  526| 1.50M|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  527| 1.50M|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  528| 1.50M|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  529| 1.50M|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  530| 1.50M|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  531| 1.50M|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  532| 1.50M|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  533| 1.50M|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  534| 1.50M|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  535| 1.50M|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  536| 1.50M|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  537| 1.50M|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  538| 1.50M|  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  539| 1.50M|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  540| 1.50M|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  541| 1.50M|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  542| 1.50M|  __m256i bf1[32];
  543|      |
  544| 1.50M|  {
  545|      |    // stage 0
  546|      |    // stage 1
  547| 1.50M|    bf1[0] = in[0];
  548| 1.50M|    bf1[4] = in[4];
  549| 1.50M|    bf1[8] = in[2];
  550| 1.50M|    bf1[12] = in[6];
  551| 1.50M|    bf1[16] = in[1];
  552| 1.50M|    bf1[20] = in[5];
  553| 1.50M|    bf1[24] = in[3];
  554| 1.50M|    bf1[28] = in[7];
  555|      |
  556|      |    // stage 2
  557| 1.50M|    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
  558| 1.50M|    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
  559| 1.50M|    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
  560| 1.50M|    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
  561| 1.50M|    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
  562| 1.50M|    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
  563| 1.50M|    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
  564| 1.50M|    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
  565|      |
  566|      |    // stage 3
  567| 1.50M|    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
  568| 1.50M|    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
  569|      |
  570| 1.50M|    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
  571| 1.50M|    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
  572| 1.50M|    bf1[17] = bf1[16];
  573| 1.50M|    bf1[18] = bf1[19];
  574| 1.50M|    bf1[21] = bf1[20];
  575| 1.50M|    bf1[22] = bf1[23];
  576| 1.50M|    bf1[25] = bf1[24];
  577| 1.50M|    bf1[26] = bf1[27];
  578| 1.50M|    bf1[29] = bf1[28];
  579| 1.50M|    bf1[30] = bf1[31];
  580|      |
  581|      |    // stage 4
  582| 1.50M|    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
  583| 1.50M|    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
  584|      |
  585| 1.50M|    bf1[9] = bf1[8];
  586| 1.50M|    bf1[10] = bf1[11];
  587| 1.50M|    bf1[13] = bf1[12];
  588| 1.50M|    bf1[14] = bf1[15];
  589|      |
  590| 1.50M|    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
  591| 1.50M|                       &cospi24, &cospi40, &cospim24, &rounding, bit);
  592|      |
  593|      |    // stage 5
  594| 1.50M|    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
  595| 1.50M|    bf1[1] = bf1[0];
  596| 1.50M|    bf1[5] = bf1[4];
  597| 1.50M|    bf1[6] = bf1[7];
  598|      |
  599| 1.50M|    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
  600| 1.50M|                       &clamp_hi, &rounding, bit);
  601|      |
  602|      |    // stage 6
  603| 1.50M|    bf1[3] = bf1[0];
  604| 1.50M|    bf1[2] = bf1[1];
  605|      |
  606| 1.50M|    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
  607| 1.50M|                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
  608|      |
  609|      |    // stage 7
  610| 1.50M|    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  611| 1.50M|                       &rounding, bit);
  612|      |
  613|      |    // stage 8
  614| 1.50M|    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  615| 1.50M|                       &rounding, bit);
  616|      |
  617|      |    // stage 9
  618| 1.50M|    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  619| 1.50M|  }
  620| 1.50M|}
  621|      |
  622|      |static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
  623|  461k|                              int bd, int out_shift) {
  624|  461k|  const int32_t *cospi = cospi_arr(bit);
  625|  461k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  626|  461k|  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  627|  461k|  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  628|  461k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  629|  461k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  630|  461k|  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  631|  461k|  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  632|  461k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  633|  461k|  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  634|  461k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  635|  461k|  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  636|  461k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  637|  461k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  638|  461k|  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
  639|  461k|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  640|  461k|  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
  641|  461k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  642|  461k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  643|  461k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  644|  461k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  645|  461k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  646|  461k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  647|  461k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  648|  461k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  649|  461k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  650|  461k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  651|  461k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  652|  461k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  653|  461k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  654|  461k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  655|  461k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  656|  461k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  657|  461k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  658|  461k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  659|  461k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  660|  461k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  661|  461k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  662|  461k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  663|  461k|  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  664|  461k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  665|  461k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  666|  461k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  667|  461k|  __m256i bf1[32];
  668|      |
  669|  461k|  {
  670|      |    // stage 0
  671|      |    // stage 1
  672|  461k|    bf1[0] = in[0];
  673|  461k|    bf1[2] = in[8];
  674|  461k|    bf1[4] = in[4];
  675|  461k|    bf1[6] = in[12];
  676|  461k|    bf1[8] = in[2];
  677|  461k|    bf1[10] = in[10];
  678|  461k|    bf1[12] = in[6];
  679|  461k|    bf1[14] = in[14];
  680|  461k|    bf1[16] = in[1];
  681|  461k|    bf1[18] = in[9];
  682|  461k|    bf1[20] = in[5];
  683|  461k|    bf1[22] = in[13];
  684|  461k|    bf1[24] = in[3];
  685|  461k|    bf1[26] = in[11];
  686|  461k|    bf1[28] = in[7];
  687|  461k|    bf1[30] = in[15];
  688|      |
  689|      |    // stage 2
  690|  461k|    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
  691|  461k|    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
  692|  461k|    bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
  693|  461k|    bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
  694|  461k|    bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
  695|  461k|    bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
  696|  461k|    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
  697|  461k|    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
  698|  461k|    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
  699|  461k|    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
  700|  461k|    bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
  701|  461k|    bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
  702|  461k|    bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
  703|  461k|    bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
  704|  461k|    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
  705|  461k|    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
  706|      |
  707|      |    // stage 3
  708|  461k|    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
  709|  461k|    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
  710|  461k|    bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
  711|  461k|    bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
  712|  461k|    bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
  713|  461k|    bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
  714|  461k|    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
  715|  461k|    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
  716|      |
  717|  461k|    addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  718|  461k|    addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  719|  461k|    addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  720|  461k|    addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  721|  461k|    addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  722|  461k|    addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  723|  461k|    addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  724|  461k|    addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
  725|      |
  726|      |    // stage 4
  727|  461k|    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
  728|  461k|    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
  729|  461k|    bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
  730|  461k|    bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
  731|      |
  732|  461k|    addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
  733|  461k|    addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
  734|  461k|    addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
  735|  461k|    addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
  736|      |
  737|  461k|    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
  738|  461k|                       &cospi24, &cospi40, &cospim24, &rounding, bit);
  739|      |
  740|      |    // stage 5
  741|  461k|    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
  742|  461k|    bf1[1] = bf1[0];
  743|  461k|    bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
  744|  461k|    bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
  745|      |
  746|  461k|    addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  747|  461k|    addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
  748|      |
  749|  461k|    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
  750|  461k|                       &clamp_hi, &rounding, bit);
  751|      |
  752|      |    // stage 6
  753|  461k|    addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
  754|  461k|    addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
  755|      |
  756|  461k|    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
  757|  461k|                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
  758|      |
  759|      |    // stage 7
  760|  461k|    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  761|  461k|                       &rounding, bit);
  762|      |
  763|      |    // stage 8
  764|  461k|    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  765|  461k|                       &rounding, bit);
  766|      |
  767|      |    // stage 9
  768|  461k|    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  769|  461k|  }
  770|  461k|}
  771|      |
772
static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
773
321k
                        int out_shift) {
774
321k
  const int32_t *cospi = cospi_arr(bit);
775
321k
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
776
321k
  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
777
321k
  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
778
321k
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
779
321k
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
780
321k
  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
781
321k
  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
782
321k
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
783
321k
  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
784
321k
  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
785
321k
  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
786
321k
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
787
321k
  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
788
321k
  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
789
321k
  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
790
321k
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
791
321k
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
792
321k
  const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
793
321k
  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
794
321k
  const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
795
321k
  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
796
321k
  const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
797
321k
  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
798
321k
  const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
799
321k
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
800
321k
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
801
321k
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
802
321k
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
803
321k
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
804
321k
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
805
321k
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
806
321k
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
807
321k
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
808
321k
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
809
321k
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
810
321k
  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
811
321k
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
812
321k
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
813
321k
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
814
321k
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
815
321k
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
816
321k
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
817
321k
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
818
321k
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
819
321k
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
820
321k
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
821
321k
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
822
321k
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
823
321k
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
824
321k
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
825
321k
  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
826
321k
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
827
321k
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
828
321k
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
829
321k
  __m256i bf1[32], bf0[32];
830
831
321k
  {
832
    // stage 0
833
    // stage 1
834
321k
    bf1[0] = in[0];
835
321k
    bf1[1] = in[16];
836
321k
    bf1[2] = in[8];
837
321k
    bf1[3] = in[24];
838
321k
    bf1[4] = in[4];
839
321k
    bf1[5] = in[20];
840
321k
    bf1[6] = in[12];
841
321k
    bf1[7] = in[28];
842
321k
    bf1[8] = in[2];
843
321k
    bf1[9] = in[18];
844
321k
    bf1[10] = in[10];
845
321k
    bf1[11] = in[26];
846
321k
    bf1[12] = in[6];
847
321k
    bf1[13] = in[22];
848
321k
    bf1[14] = in[14];
849
321k
    bf1[15] = in[30];
850
321k
    bf1[16] = in[1];
851
321k
    bf1[17] = in[17];
852
321k
    bf1[18] = in[9];
853
321k
    bf1[19] = in[25];
854
321k
    bf1[20] = in[5];
855
321k
    bf1[21] = in[21];
856
321k
    bf1[22] = in[13];
857
321k
    bf1[23] = in[29];
858
321k
    bf1[24] = in[3];
859
321k
    bf1[25] = in[19];
860
321k
    bf1[26] = in[11];
861
321k
    bf1[27] = in[27];
862
321k
    bf1[28] = in[7];
863
321k
    bf1[29] = in[23];
864
321k
    bf1[30] = in[15];
865
321k
    bf1[31] = in[31];
866
867
    // stage 2
868
321k
    bf0[0] = bf1[0];
869
321k
    bf0[1] = bf1[1];
870
321k
    bf0[2] = bf1[2];
871
321k
    bf0[3] = bf1[3];
872
321k
    bf0[4] = bf1[4];
873
321k
    bf0[5] = bf1[5];
874
321k
    bf0[6] = bf1[6];
875
321k
    bf0[7] = bf1[7];
876
321k
    bf0[8] = bf1[8];
877
321k
    bf0[9] = bf1[9];
878
321k
    bf0[10] = bf1[10];
879
321k
    bf0[11] = bf1[11];
880
321k
    bf0[12] = bf1[12];
881
321k
    bf0[13] = bf1[13];
882
321k
    bf0[14] = bf1[14];
883
321k
    bf0[15] = bf1[15];
884
321k
    bf0[16] =
885
321k
        half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
886
321k
    bf0[17] =
887
321k
        half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
888
321k
    bf0[18] =
889
321k
        half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
890
321k
    bf0[19] =
891
321k
        half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
892
321k
    bf0[20] =
893
321k
        half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
894
321k
    bf0[21] =
895
321k
        half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
896
321k
    bf0[22] =
897
321k
        half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
898
321k
    bf0[23] =
899
321k
        half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
900
321k
    bf0[24] =
901
321k
        half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
902
321k
    bf0[25] =
903
321k
        half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
904
321k
    bf0[26] =
905
321k
        half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
906
321k
    bf0[27] =
907
321k
        half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
908
321k
    bf0[28] =
909
321k
        half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
910
321k
    bf0[29] =
911
321k
        half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
912
321k
    bf0[30] =
913
321k
        half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
914
321k
    bf0[31] =
915
321k
        half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
916
917
    // stage 3
918
321k
    bf1[0] = bf0[0];
919
321k
    bf1[1] = bf0[1];
920
321k
    bf1[2] = bf0[2];
921
321k
    bf1[3] = bf0[3];
922
321k
    bf1[4] = bf0[4];
923
321k
    bf1[5] = bf0[5];
924
321k
    bf1[6] = bf0[6];
925
321k
    bf1[7] = bf0[7];
926
321k
    bf1[8] =
927
321k
        half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
928
321k
    bf1[9] =
929
321k
        half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
930
321k
    bf1[10] =
931
321k
        half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
932
321k
    bf1[11] =
933
321k
        half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
934
321k
    bf1[12] =
935
321k
        half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
936
321k
    bf1[13] =
937
321k
        half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
938
321k
    bf1[14] =
939
321k
        half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
940
321k
    bf1[15] =
941
321k
        half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
942
943
321k
    addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
944
321k
    addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
945
321k
    addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
946
321k
    addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
947
321k
    addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
948
321k
    addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
949
321k
    addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
950
321k
    addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
951
952
    // stage 4
953
321k
    bf0[0] = bf1[0];
954
321k
    bf0[1] = bf1[1];
955
321k
    bf0[2] = bf1[2];
956
321k
    bf0[3] = bf1[3];
957
321k
    bf0[4] =
958
321k
        half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
959
321k
    bf0[5] =
960
321k
        half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
961
321k
    bf0[6] =
962
321k
        half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
963
321k
    bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
964
965
321k
    addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
966
321k
    addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
967
321k
    addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
968
321k
    addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
969
970
321k
    bf0[16] = bf1[16];
971
321k
    bf0[17] =
972
321k
        half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
973
321k
    bf0[18] =
974
321k
        half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
975
321k
    bf0[19] = bf1[19];
976
321k
    bf0[20] = bf1[20];
977
321k
    bf0[21] =
978
321k
        half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
979
321k
    bf0[22] =
980
321k
        half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
981
321k
    bf0[23] = bf1[23];
982
321k
    bf0[24] = bf1[24];
983
321k
    bf0[25] =
984
321k
        half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
985
321k
    bf0[26] =
986
321k
        half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
987
321k
    bf0[27] = bf1[27];
988
321k
    bf0[28] = bf1[28];
989
321k
    bf0[29] =
990
321k
        half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
991
321k
    bf0[30] =
992
321k
        half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
993
321k
    bf0[31] = bf1[31];
994
995
    // stage 5
996
321k
    bf1[0] =
997
321k
        half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
998
321k
    bf1[1] =
999
321k
        half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
1000
321k
    bf1[2] =
1001
321k
        half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
1002
321k
    bf1[3] =
1003
321k
        half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
1004
321k
    addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
1005
321k
    addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
1006
321k
    bf1[8] = bf0[8];
1007
321k
    bf1[9] =
1008
321k
        half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
1009
321k
    bf1[10] =
1010
321k
        half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
1011
321k
    bf1[11] = bf0[11];
1012
321k
    bf1[12] = bf0[12];
1013
321k
    bf1[13] =
1014
321k
        half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
1015
321k
    bf1[14] =
1016
321k
        half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
1017
321k
    bf1[15] = bf0[15];
1018
321k
    addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
1019
321k
    addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
1020
321k
    addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
1021
321k
    addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
1022
321k
    addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
1023
321k
    addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
1024
321k
    addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
1025
321k
    addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
1026
1027
    // stage 6
1028
321k
    addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
1029
321k
    addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
1030
321k
    bf0[4] = bf1[4];
1031
321k
    bf0[5] =
1032
321k
        half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
1033
321k
    bf0[6] =
1034
321k
        half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
1035
321k
    bf0[7] = bf1[7];
1036
321k
    addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
1037
321k
    addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
1038
321k
    addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
1039
321k
    addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
1040
321k
    bf0[16] = bf1[16];
1041
321k
    bf0[17] = bf1[17];
1042
321k
    bf0[18] =
1043
321k
        half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
1044
321k
    bf0[19] =
1045
321k
        half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
1046
321k
    bf0[20] =
1047
321k
        half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
1048
321k
    bf0[21] =
1049
321k
        half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
1050
321k
    bf0[22] = bf1[22];
1051
321k
    bf0[23] = bf1[23];
1052
321k
    bf0[24] = bf1[24];
1053
321k
    bf0[25] = bf1[25];
1054
321k
    bf0[26] =
1055
321k
        half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
1056
321k
    bf0[27] =
1057
321k
        half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
1058
321k
    bf0[28] =
1059
321k
        half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
1060
321k
    bf0[29] =
1061
321k
        half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
1062
321k
    bf0[30] = bf1[30];
1063
321k
    bf0[31] = bf1[31];
1064
1065
    // stage 7
1066
321k
    addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
1067
321k
    addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
1068
321k
    addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
1069
321k
    addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
1070
321k
    bf1[8] = bf0[8];
1071
321k
    bf1[9] = bf0[9];
1072
321k
    bf1[10] =
1073
321k
        half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
1074
321k
    bf1[11] =
1075
321k
        half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
1076
321k
    bf1[12] =
1077
321k
        half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
1078
321k
    bf1[13] =
1079
321k
        half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
1080
321k
    bf1[14] = bf0[14];
1081
321k
    bf1[15] = bf0[15];
1082
321k
    addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
1083
321k
    addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
1084
321k
    addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
1085
321k
    addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
1086
321k
    addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
1087
321k
    addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
1088
321k
    addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
1089
321k
    addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
1090
1091
    // stage 8
1092
321k
    addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
1093
321k
    addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
1094
321k
    addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
1095
321k
    addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
1096
321k
    addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
1097
321k
    addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
1098
321k
    addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
1099
321k
    addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
1100
321k
    bf0[16] = bf1[16];
1101
321k
    bf0[17] = bf1[17];
1102
321k
    bf0[18] = bf1[18];
1103
321k
    bf0[19] = bf1[19];
1104
321k
    bf0[20] =
1105
321k
        half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
1106
321k
    bf0[21] =
1107
321k
        half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
1108
321k
    bf0[22] =
1109
321k
        half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
1110
321k
    bf0[23] =
1111
321k
        half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
1112
321k
    bf0[24] =
1113
321k
        half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
1114
321k
    bf0[25] =
1115
321k
        half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
1116
321k
    bf0[26] =
1117
321k
        half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
1118
321k
    bf0[27] =
1119
321k
        half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
1120
321k
    bf0[28] = bf1[28];
1121
321k
    bf0[29] = bf1[29];
1122
321k
    bf0[30] = bf1[30];
1123
321k
    bf0[31] = bf1[31];
1124
1125
    // stage 9
1126
321k
    addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
1127
321k
    addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
1128
321k
    addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
1129
321k
    addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
1130
321k
    addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
1131
321k
    addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
1132
321k
    addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
1133
321k
    addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
1134
321k
    addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
1135
321k
    addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
1136
321k
    addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
1137
321k
    addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
1138
321k
    addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
1139
321k
    addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
1140
321k
    addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
1141
321k
    addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
1142
321k
    if (!do_cols) {
1143
160k
      const int log_range_out = AOMMAX(16, bd + 6);
1144
160k
      const __m256i clamp_lo_out =
1145
160k
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1146
160k
      const __m256i clamp_hi_out =
1147
160k
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1148
160k
      round_shift_8x8_avx2(out, out_shift);
1149
160k
      round_shift_8x8_avx2(out + 16, out_shift);
1150
160k
      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
1151
160k
    }
1152
321k
  }
1153
321k
}
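// Note: half_btf_avx2, used by every rotation stage above, computes one
// "half butterfly" in 32-bit fixed point: w0*n0 + w1*n1, rounded and shifted
// back down by `bit`. The sketch below is inferred from the call sites in
// this file, not copied from the shared helper's definition, and uses a
// hypothetical name to keep the two apart.
static INLINE __m256i half_btf_avx2_sketch(const __m256i *w0,
                                           const __m256i *n0,
                                           const __m256i *w1,
                                           const __m256i *n1,
                                           const __m256i *rounding, int bit) {
  // *rounding holds _mm256_set1_epi32(1 << (bit - 1)), matching `rnding`
  // in the callers above.
  __m256i x = _mm256_mullo_epi32(*w0, *n0);
  const __m256i y = _mm256_mullo_epi32(*w1, *n1);
  x = _mm256_add_epi32(x, y);
  x = _mm256_add_epi32(x, *rounding);
  return _mm256_srai_epi32(x, bit);
}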
1154
static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1155
1.50M
                             int bd, int out_shift) {
1156
1.50M
  const int32_t *cospi = cospi_arr(bit);
1157
1.50M
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1158
1.50M
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1159
1.50M
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1160
1.50M
  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1161
1.50M
  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1162
1163
1.50M
  {
1164
    // stage 0
1165
    // stage 1
1166
    // stage 2
1167
    // stage 3
1168
    // stage 4
1169
1.50M
    in[0] = _mm256_mullo_epi32(in[0], cospi32);
1170
1.50M
    in[0] = _mm256_add_epi32(in[0], rnding);
1171
1.50M
    in[0] = _mm256_srai_epi32(in[0], bit);
1172
1173
    // stage 5
1174
    // stage 6
1175
    // stage 7
1176
1.50M
    if (!do_cols) {
1177
332k
      const int log_range_out = AOMMAX(16, bd + 6);
1178
332k
      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1179
332k
      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1180
332k
      __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
1181
332k
      in[0] = _mm256_add_epi32(in[0], offset);
1182
332k
      in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
1183
332k
    }
1184
1.50M
    in[0] = _mm256_max_epi32(in[0], clamp_lo);
1185
1.50M
    in[0] = _mm256_min_epi32(in[0], clamp_hi);
1186
1.50M
    out[0] = in[0];
1187
1.50M
    out[1] = in[0];
1188
1.50M
    out[2] = in[0];
1189
1.50M
    out[3] = in[0];
1190
1.50M
    out[4] = in[0];
1191
1.50M
    out[5] = in[0];
1192
1.50M
    out[6] = in[0];
1193
1.50M
    out[7] = in[0];
1194
1.50M
    out[8] = in[0];
1195
1.50M
    out[9] = in[0];
1196
1.50M
    out[10] = in[0];
1197
1.50M
    out[11] = in[0];
1198
1.50M
    out[12] = in[0];
1199
1.50M
    out[13] = in[0];
1200
1.50M
    out[14] = in[0];
1201
1.50M
    out[15] = in[0];
1202
1.50M
  }
1203
1.50M
}
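// Note: idct16_low1_avx2 above handles the DC-only case: when only in[0] is
// nonzero, stages 1-7 collapse and every output row equals the DC value
// rotated by cospi[32] (a 1/sqrt(2) scale in fixed point). A scalar model of
// that arithmetic, illustrative only (the vector path uses 32-bit mullo):
static INLINE int32_t idct16_dc_model(int32_t dc, int32_t cospi32, int bit) {
  // Matches in[0] = srai(mullo(in[0], cospi32) + rnding, bit) above; the
  // result is then replicated into out[0..15] before clamping.
  const int64_t t = (int64_t)dc * cospi32 + (1 << (bit - 1));
  return (int32_t)(t >> bit);
}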
1204
1205
static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1206
1.98M
                             int bd, int out_shift) {
1207
1.98M
  const int32_t *cospi = cospi_arr(bit);
1208
1.98M
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1209
1.98M
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1210
1.98M
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1211
1.98M
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1212
1.98M
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1213
1.98M
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1214
1.98M
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1215
1.98M
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1216
1.98M
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1217
1.98M
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1218
1.98M
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1219
1.98M
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1220
1.98M
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1221
1.98M
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1222
1.98M
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1223
1.98M
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1224
1.98M
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1225
1.98M
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1226
1.98M
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1227
1.98M
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1228
1.98M
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1229
1.98M
  __m256i u[16], x, y;
1230
1231
1.98M
  {
1232
    // stage 0
1233
    // stage 1
1234
1.98M
    u[0] = in[0];
1235
1.98M
    u[2] = in[4];
1236
1.98M
    u[4] = in[2];
1237
1.98M
    u[6] = in[6];
1238
1.98M
    u[8] = in[1];
1239
1.98M
    u[10] = in[5];
1240
1.98M
    u[12] = in[3];
1241
1.98M
    u[14] = in[7];
1242
1243
    // stage 2
1244
1.98M
    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
1245
1.98M
    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
1246
1247
1.98M
    u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
1248
1.98M
    u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
1249
1250
1.98M
    u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
1251
1.98M
    u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
1252
1253
1.98M
    u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
1254
1.98M
    u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
1255
1256
    // stage 3
1257
1.98M
    u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
1258
1.98M
    u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
1259
1.98M
    u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
1260
1.98M
    u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
1261
1262
1.98M
    addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1263
1.98M
    addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1264
1.98M
    addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1265
1.98M
    addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1266
1267
    // stage 4
1268
1.98M
    x = _mm256_mullo_epi32(u[0], cospi32);
1269
1.98M
    u[0] = _mm256_add_epi32(x, rnding);
1270
1.98M
    u[0] = _mm256_srai_epi32(u[0], bit);
1271
1.98M
    u[1] = u[0];
1272
1273
1.98M
    u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
1274
1.98M
    u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
1275
1276
1.98M
    addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
1277
1.98M
    addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
1278
1279
1.98M
    x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1280
1.98M
    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1281
1.98M
    u[9] = x;
1282
1.98M
    y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1283
1.98M
    u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1284
1.98M
    u[10] = y;
1285
1286
    // stage 5
1287
1.98M
    addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1288
1.98M
    addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1289
1290
1.98M
    x = _mm256_mullo_epi32(u[5], cospi32);
1291
1.98M
    y = _mm256_mullo_epi32(u[6], cospi32);
1292
1.98M
    u[5] = _mm256_sub_epi32(y, x);
1293
1.98M
    u[5] = _mm256_add_epi32(u[5], rnding);
1294
1.98M
    u[5] = _mm256_srai_epi32(u[5], bit);
1295
1296
1.98M
    u[6] = _mm256_add_epi32(y, x);
1297
1.98M
    u[6] = _mm256_add_epi32(u[6], rnding);
1298
1.98M
    u[6] = _mm256_srai_epi32(u[6], bit);
1299
1300
1.98M
    addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1301
1.98M
    addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1302
1.98M
    addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1303
1.98M
    addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1304
1305
    // stage 6
1306
1.98M
    addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
1307
1.98M
    addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
1308
1.98M
    addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
1309
1.98M
    addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
1310
1311
1.98M
    x = _mm256_mullo_epi32(u[10], cospi32);
1312
1.98M
    y = _mm256_mullo_epi32(u[13], cospi32);
1313
1.98M
    u[10] = _mm256_sub_epi32(y, x);
1314
1.98M
    u[10] = _mm256_add_epi32(u[10], rnding);
1315
1.98M
    u[10] = _mm256_srai_epi32(u[10], bit);
1316
1317
1.98M
    u[13] = _mm256_add_epi32(x, y);
1318
1.98M
    u[13] = _mm256_add_epi32(u[13], rnding);
1319
1.98M
    u[13] = _mm256_srai_epi32(u[13], bit);
1320
1321
1.98M
    x = _mm256_mullo_epi32(u[11], cospi32);
1322
1.98M
    y = _mm256_mullo_epi32(u[12], cospi32);
1323
1.98M
    u[11] = _mm256_sub_epi32(y, x);
1324
1.98M
    u[11] = _mm256_add_epi32(u[11], rnding);
1325
1.98M
    u[11] = _mm256_srai_epi32(u[11], bit);
1326
1327
1.98M
    u[12] = _mm256_add_epi32(x, y);
1328
1.98M
    u[12] = _mm256_add_epi32(u[12], rnding);
1329
1.98M
    u[12] = _mm256_srai_epi32(u[12], bit);
1330
    // stage 7
1331
1.98M
    addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
1332
1.98M
    addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
1333
1.98M
    addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
1334
1.98M
    addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
1335
1.98M
    addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
1336
1.98M
    addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
1337
1.98M
    addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
1338
1.98M
    addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
1339
1340
1.98M
    if (!do_cols) {
1341
408k
      const int log_range_out = AOMMAX(16, bd + 6);
1342
408k
      const __m256i clamp_lo_out =
1343
408k
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1344
408k
      const __m256i clamp_hi_out =
1345
408k
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1346
408k
      round_shift_8x8_avx2(out, out_shift);
1347
408k
      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1348
408k
    }
1349
1.98M
  }
1350
1.98M
}
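// Note: the _low8 variant above relies on at most the first 8 input rows
// being nonzero, so each stage-2/3 butterfly has one zero leg and reduces to
// a single-input rotation, which is what half_btf_0_avx2 provides. A sketch
// inferred from its call sites (hypothetical name, not the shared helper):
static INLINE __m256i half_btf_0_sketch(const __m256i *w0, const __m256i *n0,
                                        const __m256i *rounding, int bit) {
  // Only w0*n0 survives when the second butterfly input is zero.
  __m256i x = _mm256_mullo_epi32(*w0, *n0);
  x = _mm256_add_epi32(x, *rounding);
  return _mm256_srai_epi32(x, bit);
}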
1351
1352
static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
1353
1.22M
                        int out_shift) {
1354
1.22M
  const int32_t *cospi = cospi_arr(bit);
1355
1.22M
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1356
1.22M
  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
1357
1.22M
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1358
1.22M
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1359
1.22M
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1360
1.22M
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1361
1.22M
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
1362
1.22M
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1363
1.22M
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1364
1.22M
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
1365
1.22M
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
1366
1.22M
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1367
1.22M
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1368
1.22M
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
1369
1.22M
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1370
1.22M
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1371
1.22M
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1372
1.22M
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1373
1.22M
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1374
1.22M
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1375
1.22M
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1376
1.22M
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1377
1.22M
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1378
1.22M
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1379
1.22M
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1380
1.22M
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1381
1.22M
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1382
1.22M
  __m256i u[16], v[16], x, y;
1383
1384
1.22M
  {
1385
    // stage 0
1386
    // stage 1
1387
1.22M
    u[0] = in[0];
1388
1.22M
    u[1] = in[8];
1389
1.22M
    u[2] = in[4];
1390
1.22M
    u[3] = in[12];
1391
1.22M
    u[4] = in[2];
1392
1.22M
    u[5] = in[10];
1393
1.22M
    u[6] = in[6];
1394
1.22M
    u[7] = in[14];
1395
1.22M
    u[8] = in[1];
1396
1.22M
    u[9] = in[9];
1397
1.22M
    u[10] = in[5];
1398
1.22M
    u[11] = in[13];
1399
1.22M
    u[12] = in[3];
1400
1.22M
    u[13] = in[11];
1401
1.22M
    u[14] = in[7];
1402
1.22M
    u[15] = in[15];
1403
1404
    // stage 2
1405
1.22M
    v[0] = u[0];
1406
1.22M
    v[1] = u[1];
1407
1.22M
    v[2] = u[2];
1408
1.22M
    v[3] = u[3];
1409
1.22M
    v[4] = u[4];
1410
1.22M
    v[5] = u[5];
1411
1.22M
    v[6] = u[6];
1412
1.22M
    v[7] = u[7];
1413
1414
1.22M
    v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
1415
1.22M
    v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
1416
1.22M
    v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
1417
1.22M
    v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
1418
1.22M
    v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
1419
1.22M
    v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
1420
1.22M
    v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
1421
1.22M
    v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
1422
1423
    // stage 3
1424
1.22M
    u[0] = v[0];
1425
1.22M
    u[1] = v[1];
1426
1.22M
    u[2] = v[2];
1427
1.22M
    u[3] = v[3];
1428
1.22M
    u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
1429
1.22M
    u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
1430
1.22M
    u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
1431
1.22M
    u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
1432
1.22M
    addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1433
1.22M
    addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1434
1.22M
    addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1435
1.22M
    addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1436
1437
    // stage 4
1438
1.22M
    x = _mm256_mullo_epi32(u[0], cospi32);
1439
1.22M
    y = _mm256_mullo_epi32(u[1], cospi32);
1440
1.22M
    v[0] = _mm256_add_epi32(x, y);
1441
1.22M
    v[0] = _mm256_add_epi32(v[0], rnding);
1442
1.22M
    v[0] = _mm256_srai_epi32(v[0], bit);
1443
1444
1.22M
    v[1] = _mm256_sub_epi32(x, y);
1445
1.22M
    v[1] = _mm256_add_epi32(v[1], rnding);
1446
1.22M
    v[1] = _mm256_srai_epi32(v[1], bit);
1447
1448
1.22M
    v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
1449
1.22M
    v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
1450
1.22M
    addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
1451
1.22M
    addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
1452
1.22M
    v[8] = u[8];
1453
1.22M
    v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1454
1.22M
    v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1455
1.22M
    v[11] = u[11];
1456
1.22M
    v[12] = u[12];
1457
1.22M
    v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1458
1.22M
    v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1459
1.22M
    v[15] = u[15];
1460
1461
    // stage 5
1462
1.22M
    addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1463
1.22M
    addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1464
1.22M
    u[4] = v[4];
1465
1466
1.22M
    x = _mm256_mullo_epi32(v[5], cospi32);
1467
1.22M
    y = _mm256_mullo_epi32(v[6], cospi32);
1468
1.22M
    u[5] = _mm256_sub_epi32(y, x);
1469
1.22M
    u[5] = _mm256_add_epi32(u[5], rnding);
1470
1.22M
    u[5] = _mm256_srai_epi32(u[5], bit);
1471
1472
1.22M
    u[6] = _mm256_add_epi32(y, x);
1473
1.22M
    u[6] = _mm256_add_epi32(u[6], rnding);
1474
1.22M
    u[6] = _mm256_srai_epi32(u[6], bit);
1475
1476
1.22M
    u[7] = v[7];
1477
1.22M
    addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1478
1.22M
    addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1479
1.22M
    addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1480
1.22M
    addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1481
1482
    // stage 6
1483
1.22M
    addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
1484
1.22M
    addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
1485
1.22M
    addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
1486
1.22M
    addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
1487
1.22M
    v[8] = u[8];
1488
1.22M
    v[9] = u[9];
1489
1490
1.22M
    x = _mm256_mullo_epi32(u[10], cospi32);
1491
1.22M
    y = _mm256_mullo_epi32(u[13], cospi32);
1492
1.22M
    v[10] = _mm256_sub_epi32(y, x);
1493
1.22M
    v[10] = _mm256_add_epi32(v[10], rnding);
1494
1.22M
    v[10] = _mm256_srai_epi32(v[10], bit);
1495
1496
1.22M
    v[13] = _mm256_add_epi32(x, y);
1497
1.22M
    v[13] = _mm256_add_epi32(v[13], rnding);
1498
1.22M
    v[13] = _mm256_srai_epi32(v[13], bit);
1499
1500
1.22M
    x = _mm256_mullo_epi32(u[11], cospi32);
1501
1.22M
    y = _mm256_mullo_epi32(u[12], cospi32);
1502
1.22M
    v[11] = _mm256_sub_epi32(y, x);
1503
1.22M
    v[11] = _mm256_add_epi32(v[11], rnding);
1504
1.22M
    v[11] = _mm256_srai_epi32(v[11], bit);
1505
1506
1.22M
    v[12] = _mm256_add_epi32(x, y);
1507
1.22M
    v[12] = _mm256_add_epi32(v[12], rnding);
1508
1.22M
    v[12] = _mm256_srai_epi32(v[12], bit);
1509
1510
1.22M
    v[14] = u[14];
1511
1.22M
    v[15] = u[15];
1512
1513
    // stage 7
1514
1.22M
    addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
1515
1.22M
    addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
1516
1.22M
    addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
1517
1.22M
    addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
1518
1.22M
    addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
1519
1.22M
    addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
1520
1.22M
    addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
1521
1.22M
    addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
1522
1523
1.22M
    if (!do_cols) {
1524
290k
      const int log_range_out = AOMMAX(16, bd + 6);
1525
290k
      const __m256i clamp_lo_out =
1526
290k
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1527
290k
      const __m256i clamp_hi_out =
1528
290k
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1529
290k
      round_shift_8x8_avx2(out, out_shift);
1530
290k
      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1531
290k
    }
1532
1.22M
  }
1533
1.22M
}
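// Note: every add/sub stage above goes through addsub_avx2, a butterfly that
// clamps both outputs back into the working range so the 32-bit lanes cannot
// wrap between stages. A sketch inferred from the call sites (hypothetical
// name; the real helper is defined elsewhere in this file):
static INLINE void addsub_sketch(const __m256i in0, const __m256i in1,
                                 __m256i *out0, __m256i *out1,
                                 const __m256i *clamp_lo,
                                 const __m256i *clamp_hi) {
  const __m256i a0 = _mm256_add_epi32(in0, in1);
  const __m256i a1 = _mm256_sub_epi32(in0, in1);
  // Keep intermediates inside [-2^(log_range-1), 2^(log_range-1) - 1].
  *out0 = _mm256_min_epi32(_mm256_max_epi32(a0, *clamp_lo), *clamp_hi);
  *out1 = _mm256_min_epi32(_mm256_max_epi32(a1, *clamp_lo), *clamp_hi);
}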
1534
1535
static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1536
320k
                              int bd, int out_shift) {
1537
320k
  const int32_t *cospi = cospi_arr(bit);
1538
320k
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1539
320k
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1540
320k
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1541
320k
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1542
320k
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1543
320k
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1544
320k
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1545
320k
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1546
320k
  const __m256i zero = _mm256_setzero_si256();
1547
320k
  __m256i v[16], x, y, temp1, temp2;
1548
1549
  // Calculate columns 0, 1, 2, 3
1550
320k
  {
1551
    // stage 0
1552
    // stage 1
1553
    // stage 2
1554
320k
    x = _mm256_mullo_epi32(in[0], cospi62);
1555
320k
    v[0] = _mm256_add_epi32(x, rnding);
1556
320k
    v[0] = _mm256_srai_epi32(v[0], bit);
1557
1558
320k
    x = _mm256_mullo_epi32(in[0], cospi2);
1559
320k
    v[1] = _mm256_sub_epi32(zero, x);
1560
320k
    v[1] = _mm256_add_epi32(v[1], rnding);
1561
320k
    v[1] = _mm256_srai_epi32(v[1], bit);
1562
1563
    // stage 3
1564
320k
    v[8] = v[0];
1565
320k
    v[9] = v[1];
1566
1567
    // stage 4
1568
320k
    temp1 = _mm256_mullo_epi32(v[8], cospi8);
1569
320k
    x = _mm256_mullo_epi32(v[9], cospi56);
1570
320k
    temp1 = _mm256_add_epi32(temp1, x);
1571
320k
    temp1 = _mm256_add_epi32(temp1, rnding);
1572
320k
    temp1 = _mm256_srai_epi32(temp1, bit);
1573
1574
320k
    temp2 = _mm256_mullo_epi32(v[8], cospi56);
1575
320k
    x = _mm256_mullo_epi32(v[9], cospi8);
1576
320k
    temp2 = _mm256_sub_epi32(temp2, x);
1577
320k
    temp2 = _mm256_add_epi32(temp2, rnding);
1578
320k
    temp2 = _mm256_srai_epi32(temp2, bit);
1579
320k
    v[8] = temp1;
1580
320k
    v[9] = temp2;
1581
1582
    // stage 5
1583
320k
    v[4] = v[0];
1584
320k
    v[5] = v[1];
1585
320k
    v[12] = v[8];
1586
320k
    v[13] = v[9];
1587
1588
    // stage 6
1589
320k
    temp1 = _mm256_mullo_epi32(v[4], cospi16);
1590
320k
    x = _mm256_mullo_epi32(v[5], cospi48);
1591
320k
    temp1 = _mm256_add_epi32(temp1, x);
1592
320k
    temp1 = _mm256_add_epi32(temp1, rnding);
1593
320k
    temp1 = _mm256_srai_epi32(temp1, bit);
1594
1595
320k
    temp2 = _mm256_mullo_epi32(v[4], cospi48);
1596
320k
    x = _mm256_mullo_epi32(v[5], cospi16);
1597
320k
    temp2 = _mm256_sub_epi32(temp2, x);
1598
320k
    temp2 = _mm256_add_epi32(temp2, rnding);
1599
320k
    temp2 = _mm256_srai_epi32(temp2, bit);
1600
320k
    v[4] = temp1;
1601
320k
    v[5] = temp2;
1602
1603
320k
    temp1 = _mm256_mullo_epi32(v[12], cospi16);
1604
320k
    x = _mm256_mullo_epi32(v[13], cospi48);
1605
320k
    temp1 = _mm256_add_epi32(temp1, x);
1606
320k
    temp1 = _mm256_add_epi32(temp1, rnding);
1607
320k
    temp1 = _mm256_srai_epi32(temp1, bit);
1608
1609
320k
    temp2 = _mm256_mullo_epi32(v[12], cospi48);
1610
320k
    x = _mm256_mullo_epi32(v[13], cospi16);
1611
320k
    temp2 = _mm256_sub_epi32(temp2, x);
1612
320k
    temp2 = _mm256_add_epi32(temp2, rnding);
1613
320k
    temp2 = _mm256_srai_epi32(temp2, bit);
1614
320k
    v[12] = temp1;
1615
320k
    v[13] = temp2;
1616
1617
    // stage 7
1618
320k
    v[2] = v[0];
1619
320k
    v[3] = v[1];
1620
320k
    v[6] = v[4];
1621
320k
    v[7] = v[5];
1622
320k
    v[10] = v[8];
1623
320k
    v[11] = v[9];
1624
320k
    v[14] = v[12];
1625
320k
    v[15] = v[13];
1626
1627
    // stage 8
1628
320k
    y = _mm256_mullo_epi32(v[2], cospi32);
1629
320k
    x = _mm256_mullo_epi32(v[3], cospi32);
1630
320k
    v[2] = _mm256_add_epi32(y, x);
1631
320k
    v[2] = _mm256_add_epi32(v[2], rnding);
1632
320k
    v[2] = _mm256_srai_epi32(v[2], bit);
1633
1634
320k
    v[3] = _mm256_sub_epi32(y, x);
1635
320k
    v[3] = _mm256_add_epi32(v[3], rnding);
1636
320k
    v[3] = _mm256_srai_epi32(v[3], bit);
1637
1638
320k
    y = _mm256_mullo_epi32(v[6], cospi32);
1639
320k
    x = _mm256_mullo_epi32(v[7], cospi32);
1640
320k
    v[6] = _mm256_add_epi32(y, x);
1641
320k
    v[6] = _mm256_add_epi32(v[6], rnding);
1642
320k
    v[6] = _mm256_srai_epi32(v[6], bit);
1643
1644
320k
    v[7] = _mm256_sub_epi32(y, x);
1645
320k
    v[7] = _mm256_add_epi32(v[7], rnding);
1646
320k
    v[7] = _mm256_srai_epi32(v[7], bit);
1647
1648
320k
    y = _mm256_mullo_epi32(v[10], cospi32);
1649
320k
    x = _mm256_mullo_epi32(v[11], cospi32);
1650
320k
    v[10] = _mm256_add_epi32(y, x);
1651
320k
    v[10] = _mm256_add_epi32(v[10], rnding);
1652
320k
    v[10] = _mm256_srai_epi32(v[10], bit);
1653
1654
320k
    v[11] = _mm256_sub_epi32(y, x);
1655
320k
    v[11] = _mm256_add_epi32(v[11], rnding);
1656
320k
    v[11] = _mm256_srai_epi32(v[11], bit);
1657
1658
320k
    y = _mm256_mullo_epi32(v[14], cospi32);
1659
320k
    x = _mm256_mullo_epi32(v[15], cospi32);
1660
320k
    v[14] = _mm256_add_epi32(y, x);
1661
320k
    v[14] = _mm256_add_epi32(v[14], rnding);
1662
320k
    v[14] = _mm256_srai_epi32(v[14], bit);
1663
1664
320k
    v[15] = _mm256_sub_epi32(y, x);
1665
320k
    v[15] = _mm256_add_epi32(v[15], rnding);
1666
320k
    v[15] = _mm256_srai_epi32(v[15], bit);
1667
1668
    // stage 9
1669
320k
    if (do_cols) {
1670
163k
      out[0] = v[0];
1671
163k
      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
1672
163k
      out[2] = v[12];
1673
163k
      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
1674
163k
      out[4] = v[6];
1675
163k
      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
1676
163k
      out[6] = v[10];
1677
163k
      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
1678
163k
      out[8] = v[3];
1679
163k
      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
1680
163k
      out[10] = v[15];
1681
163k
      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
1682
163k
      out[12] = v[5];
1683
163k
      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
1684
163k
      out[14] = v[9];
1685
163k
      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
1686
163k
    } else {
1687
156k
      const int log_range_out = AOMMAX(16, bd + 6);
1688
156k
      const __m256i clamp_lo_out =
1689
156k
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1690
156k
      const __m256i clamp_hi_out =
1691
156k
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1692
1693
156k
      neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1694
156k
                     out_shift);
1695
156k
      neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
1696
156k
                     &clamp_hi_out, out_shift);
1697
156k
      neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
1698
156k
                     &clamp_hi_out, out_shift);
1699
156k
      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
1700
156k
                     &clamp_hi_out, out_shift);
1701
156k
      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
1702
156k
                     &clamp_hi_out, out_shift);
1703
156k
      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
1704
156k
                     &clamp_hi_out, out_shift);
1705
156k
      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
1706
156k
                     &clamp_hi_out, out_shift);
1707
156k
      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
1708
156k
                     &clamp_hi_out, out_shift);
1709
156k
    }
1710
320k
  }
1711
320k
}
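// Note: in the else-branch above (the row pass), neg_shift_avx2 emits one
// positive and one negated iadst output per call, folding the out_shift
// rounding and the final clamp into a single step. A sketch inferred from
// the call sites (hypothetical name); note that _mm256_sra_epi32 takes its
// variable shift count in the low lanes of an XMM register.
static INLINE void neg_shift_sketch(const __m256i in0, const __m256i in1,
                                    __m256i *out0, __m256i *out1,
                                    const __m256i *clamp_lo,
                                    const __m256i *clamp_hi, int shift) {
  const __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
  __m256i a0 = _mm256_add_epi32(offset, in0);  // round(in0 / 2^shift)
  __m256i a1 = _mm256_sub_epi32(offset, in1);  // round(-in1 / 2^shift)
  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
  *out0 = _mm256_min_epi32(_mm256_max_epi32(a0, *clamp_lo), *clamp_hi);
  *out1 = _mm256_min_epi32(_mm256_max_epi32(a1, *clamp_lo), *clamp_hi);
}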
1712
1713
static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1714
782k
                              int bd, int out_shift) {
1715
782k
  const int32_t *cospi = cospi_arr(bit);
1716
782k
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1717
782k
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1718
782k
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
1719
782k
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
1720
782k
  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
1721
782k
  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
1722
782k
  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
1723
782k
  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
1724
782k
  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
1725
782k
  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
1726
782k
  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
1727
782k
  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
1728
782k
  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
1729
782k
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
1730
782k
  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
1731
782k
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
1732
782k
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1733
782k
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1734
782k
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1735
782k
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1736
782k
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
1737
782k
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
1738
782k
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1739
782k
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1740
782k
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1741
782k
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1742
782k
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1743
782k
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1744
782k
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1745
782k
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1746
782k
  __m256i u[16], x, y;
1747
1748
782k
  {
1749
    // stage 0
1750
    // stage 1
1751
    // stage 2
1752
782k
    __m256i zero = _mm256_setzero_si256();
1753
782k
    x = _mm256_mullo_epi32(in[0], cospi62);
1754
782k
    u[0] = _mm256_add_epi32(x, rnding);
1755
782k
    u[0] = _mm256_srai_epi32(u[0], bit);
1756
1757
782k
    x = _mm256_mullo_epi32(in[0], cospi2);
1758
782k
    u[1] = _mm256_sub_epi32(zero, x);
1759
782k
    u[1] = _mm256_add_epi32(u[1], rnding);
1760
782k
    u[1] = _mm256_srai_epi32(u[1], bit);
1761
1762
782k
    x = _mm256_mullo_epi32(in[2], cospi54);
1763
782k
    u[2] = _mm256_add_epi32(x, rnding);
1764
782k
    u[2] = _mm256_srai_epi32(u[2], bit);
1765
1766
782k
    x = _mm256_mullo_epi32(in[2], cospi10);
1767
782k
    u[3] = _mm256_sub_epi32(zero, x);
1768
782k
    u[3] = _mm256_add_epi32(u[3], rnding);
1769
782k
    u[3] = _mm256_srai_epi32(u[3], bit);
1770
1771
782k
    x = _mm256_mullo_epi32(in[4], cospi46);
1772
782k
    u[4] = _mm256_add_epi32(x, rnding);
1773
782k
    u[4] = _mm256_srai_epi32(u[4], bit);
1774
1775
782k
    x = _mm256_mullo_epi32(in[4], cospi18);
1776
782k
    u[5] = _mm256_sub_epi32(zero, x);
1777
782k
    u[5] = _mm256_add_epi32(u[5], rnding);
1778
782k
    u[5] = _mm256_srai_epi32(u[5], bit);
1779
1780
782k
    x = _mm256_mullo_epi32(in[6], cospi38);
1781
782k
    u[6] = _mm256_add_epi32(x, rnding);
1782
782k
    u[6] = _mm256_srai_epi32(u[6], bit);
1783
1784
782k
    x = _mm256_mullo_epi32(in[6], cospi26);
1785
782k
    u[7] = _mm256_sub_epi32(zero, x);
1786
782k
    u[7] = _mm256_add_epi32(u[7], rnding);
1787
782k
    u[7] = _mm256_srai_epi32(u[7], bit);
1788
1789
782k
    u[8] = _mm256_mullo_epi32(in[7], cospi34);
1790
782k
    u[8] = _mm256_add_epi32(u[8], rnding);
1791
782k
    u[8] = _mm256_srai_epi32(u[8], bit);
1792
1793
782k
    u[9] = _mm256_mullo_epi32(in[7], cospi30);
1794
782k
    u[9] = _mm256_add_epi32(u[9], rnding);
1795
782k
    u[9] = _mm256_srai_epi32(u[9], bit);
1796
1797
782k
    u[10] = _mm256_mullo_epi32(in[5], cospi42);
1798
782k
    u[10] = _mm256_add_epi32(u[10], rnding);
1799
782k
    u[10] = _mm256_srai_epi32(u[10], bit);
1800
1801
782k
    u[11] = _mm256_mullo_epi32(in[5], cospi22);
1802
782k
    u[11] = _mm256_add_epi32(u[11], rnding);
1803
782k
    u[11] = _mm256_srai_epi32(u[11], bit);
1804
1805
782k
    u[12] = _mm256_mullo_epi32(in[3], cospi50);
1806
782k
    u[12] = _mm256_add_epi32(u[12], rnding);
1807
782k
    u[12] = _mm256_srai_epi32(u[12], bit);
1808
1809
782k
    u[13] = _mm256_mullo_epi32(in[3], cospi14);
1810
782k
    u[13] = _mm256_add_epi32(u[13], rnding);
1811
782k
    u[13] = _mm256_srai_epi32(u[13], bit);
1812
1813
782k
    u[14] = _mm256_mullo_epi32(in[1], cospi58);
1814
782k
    u[14] = _mm256_add_epi32(u[14], rnding);
1815
782k
    u[14] = _mm256_srai_epi32(u[14], bit);
1816
1817
782k
    u[15] = _mm256_mullo_epi32(in[1], cospi6);
1818
782k
    u[15] = _mm256_add_epi32(u[15], rnding);
1819
782k
    u[15] = _mm256_srai_epi32(u[15], bit);
1820
1821
    // stage 3
1822
782k
    addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
1823
782k
    addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
1824
782k
    addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
1825
782k
    addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
1826
782k
    addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
1827
782k
    addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
1828
782k
    addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
1829
782k
    addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
1830
1831
    // stage 4
1832
782k
    y = _mm256_mullo_epi32(u[8], cospi56);
1833
782k
    x = _mm256_mullo_epi32(u[9], cospi56);
1834
782k
    u[8] = _mm256_mullo_epi32(u[8], cospi8);
1835
782k
    u[8] = _mm256_add_epi32(u[8], x);
1836
782k
    u[8] = _mm256_add_epi32(u[8], rnding);
1837
782k
    u[8] = _mm256_srai_epi32(u[8], bit);
1838
1839
782k
    x = _mm256_mullo_epi32(u[9], cospi8);
1840
782k
    u[9] = _mm256_sub_epi32(y, x);
1841
782k
    u[9] = _mm256_add_epi32(u[9], rnding);
1842
782k
    u[9] = _mm256_srai_epi32(u[9], bit);
1843
1844
782k
    x = _mm256_mullo_epi32(u[11], cospi24);
1845
782k
    y = _mm256_mullo_epi32(u[10], cospi24);
1846
782k
    u[10] = _mm256_mullo_epi32(u[10], cospi40);
1847
782k
    u[10] = _mm256_add_epi32(u[10], x);
1848
782k
    u[10] = _mm256_add_epi32(u[10], rnding);
1849
782k
    u[10] = _mm256_srai_epi32(u[10], bit);
1850
1851
782k
    x = _mm256_mullo_epi32(u[11], cospi40);
1852
782k
    u[11] = _mm256_sub_epi32(y, x);
1853
782k
    u[11] = _mm256_add_epi32(u[11], rnding);
1854
782k
    u[11] = _mm256_srai_epi32(u[11], bit);
1855
1856
782k
    x = _mm256_mullo_epi32(u[13], cospi8);
1857
782k
    y = _mm256_mullo_epi32(u[12], cospi8);
1858
782k
    u[12] = _mm256_mullo_epi32(u[12], cospim56);
1859
782k
    u[12] = _mm256_add_epi32(u[12], x);
1860
782k
    u[12] = _mm256_add_epi32(u[12], rnding);
1861
782k
    u[12] = _mm256_srai_epi32(u[12], bit);
1862
1863
782k
    x = _mm256_mullo_epi32(u[13], cospim56);
1864
782k
    u[13] = _mm256_sub_epi32(y, x);
1865
782k
    u[13] = _mm256_add_epi32(u[13], rnding);
1866
782k
    u[13] = _mm256_srai_epi32(u[13], bit);
1867
1868
782k
    x = _mm256_mullo_epi32(u[15], cospi40);
1869
782k
    y = _mm256_mullo_epi32(u[14], cospi40);
1870
782k
    u[14] = _mm256_mullo_epi32(u[14], cospim24);
1871
782k
    u[14] = _mm256_add_epi32(u[14], x);
1872
782k
    u[14] = _mm256_add_epi32(u[14], rnding);
1873
782k
    u[14] = _mm256_srai_epi32(u[14], bit);
1874
1875
782k
    x = _mm256_mullo_epi32(u[15], cospim24);
1876
782k
    u[15] = _mm256_sub_epi32(y, x);
1877
782k
    u[15] = _mm256_add_epi32(u[15], rnding);
1878
782k
    u[15] = _mm256_srai_epi32(u[15], bit);
1879
1880
    // stage 5
1881
782k
    addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
1882
782k
    addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
1883
782k
    addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
1884
782k
    addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
1885
782k
    addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
1886
782k
    addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
1887
782k
    addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
1888
782k
    addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
1889
1890
    // stage 6
1891
782k
    x = _mm256_mullo_epi32(u[5], cospi48);
1892
782k
    y = _mm256_mullo_epi32(u[4], cospi48);
1893
782k
    u[4] = _mm256_mullo_epi32(u[4], cospi16);
1894
782k
    u[4] = _mm256_add_epi32(u[4], x);
1895
782k
    u[4] = _mm256_add_epi32(u[4], rnding);
1896
782k
    u[4] = _mm256_srai_epi32(u[4], bit);
1897
1898
782k
    x = _mm256_mullo_epi32(u[5], cospi16);
1899
782k
    u[5] = _mm256_sub_epi32(y, x);
1900
782k
    u[5] = _mm256_add_epi32(u[5], rnding);
1901
782k
    u[5] = _mm256_srai_epi32(u[5], bit);
1902
1903
782k
    x = _mm256_mullo_epi32(u[7], cospi16);
1904
782k
    y = _mm256_mullo_epi32(u[6], cospi16);
1905
782k
    u[6] = _mm256_mullo_epi32(u[6], cospim48);
1906
782k
    u[6] = _mm256_add_epi32(u[6], x);
1907
782k
    u[6] = _mm256_add_epi32(u[6], rnding);
1908
782k
    u[6] = _mm256_srai_epi32(u[6], bit);
1909
1910
782k
    x = _mm256_mullo_epi32(u[7], cospim48);
1911
782k
    u[7] = _mm256_sub_epi32(y, x);
1912
782k
    u[7] = _mm256_add_epi32(u[7], rnding);
1913
782k
    u[7] = _mm256_srai_epi32(u[7], bit);
1914
1915
782k
    x = _mm256_mullo_epi32(u[13], cospi48);
1916
782k
    y = _mm256_mullo_epi32(u[12], cospi48);
1917
782k
    u[12] = _mm256_mullo_epi32(u[12], cospi16);
1918
782k
    u[12] = _mm256_add_epi32(u[12], x);
1919
782k
    u[12] = _mm256_add_epi32(u[12], rnding);
1920
782k
    u[12] = _mm256_srai_epi32(u[12], bit);
1921
1922
782k
    x = _mm256_mullo_epi32(u[13], cospi16);
1923
782k
    u[13] = _mm256_sub_epi32(y, x);
1924
782k
    u[13] = _mm256_add_epi32(u[13], rnding);
1925
782k
    u[13] = _mm256_srai_epi32(u[13], bit);
1926
1927
782k
    x = _mm256_mullo_epi32(u[15], cospi16);
1928
782k
    y = _mm256_mullo_epi32(u[14], cospi16);
1929
782k
    u[14] = _mm256_mullo_epi32(u[14], cospim48);
1930
782k
    u[14] = _mm256_add_epi32(u[14], x);
1931
782k
    u[14] = _mm256_add_epi32(u[14], rnding);
1932
782k
    u[14] = _mm256_srai_epi32(u[14], bit);
1933
1934
782k
    x = _mm256_mullo_epi32(u[15], cospim48);
1935
782k
    u[15] = _mm256_sub_epi32(y, x);
1936
782k
    u[15] = _mm256_add_epi32(u[15], rnding);
1937
782k
    u[15] = _mm256_srai_epi32(u[15], bit);
1938
1939
    // stage 7
1940
782k
    addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
1941
782k
    addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
1942
782k
    addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
1943
782k
    addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
1944
782k
    addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
1945
782k
    addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
1946
782k
    addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
1947
782k
    addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
1948
1949
    // stage 8
1950
782k
    y = _mm256_mullo_epi32(u[2], cospi32);
1951
782k
    x = _mm256_mullo_epi32(u[3], cospi32);
1952
782k
    u[2] = _mm256_add_epi32(y, x);
1953
782k
    u[2] = _mm256_add_epi32(u[2], rnding);
1954
782k
    u[2] = _mm256_srai_epi32(u[2], bit);
1955
1956
782k
    u[3] = _mm256_sub_epi32(y, x);
1957
782k
    u[3] = _mm256_add_epi32(u[3], rnding);
1958
782k
    u[3] = _mm256_srai_epi32(u[3], bit);
1959
782k
    y = _mm256_mullo_epi32(u[6], cospi32);
1960
782k
    x = _mm256_mullo_epi32(u[7], cospi32);
1961
782k
    u[6] = _mm256_add_epi32(y, x);
1962
782k
    u[6] = _mm256_add_epi32(u[6], rnding);
1963
782k
    u[6] = _mm256_srai_epi32(u[6], bit);
1964
1965
782k
    u[7] = _mm256_sub_epi32(y, x);
1966
782k
    u[7] = _mm256_add_epi32(u[7], rnding);
1967
782k
    u[7] = _mm256_srai_epi32(u[7], bit);
1968
1969
782k
    y = _mm256_mullo_epi32(u[10], cospi32);
1970
782k
    x = _mm256_mullo_epi32(u[11], cospi32);
1971
782k
    u[10] = _mm256_add_epi32(y, x);
1972
782k
    u[10] = _mm256_add_epi32(u[10], rnding);
1973
782k
    u[10] = _mm256_srai_epi32(u[10], bit);
1974
1975
782k
    u[11] = _mm256_sub_epi32(y, x);
1976
782k
    u[11] = _mm256_add_epi32(u[11], rnding);
1977
782k
    u[11] = _mm256_srai_epi32(u[11], bit);
1978
1979
782k
    y = _mm256_mullo_epi32(u[14], cospi32);
1980
782k
    x = _mm256_mullo_epi32(u[15], cospi32);
1981
782k
    u[14] = _mm256_add_epi32(y, x);
1982
782k
    u[14] = _mm256_add_epi32(u[14], rnding);
1983
782k
    u[14] = _mm256_srai_epi32(u[14], bit);
1984
1985
782k
    u[15] = _mm256_sub_epi32(y, x);
1986
782k
    u[15] = _mm256_add_epi32(u[15], rnding);
1987
782k
    u[15] = _mm256_srai_epi32(u[15], bit);
1988
1989
    // stage 9
1990
782k
    if (do_cols) {
1991
417k
      out[0] = u[0];
1992
417k
      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
1993
417k
      out[2] = u[12];
1994
417k
      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
1995
417k
      out[4] = u[6];
1996
417k
      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
1997
417k
      out[6] = u[10];
1998
417k
      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
1999
417k
      out[8] = u[3];
2000
417k
      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
2001
417k
      out[10] = u[15];
2002
417k
      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
2003
417k
      out[12] = u[5];
2004
417k
      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
2005
417k
      out[14] = u[9];
2006
417k
      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
2007
417k
    } else {
2008
365k
      const int log_range_out = AOMMAX(16, bd + 6);
2009
365k
      const __m256i clamp_lo_out =
2010
365k
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2011
365k
      const __m256i clamp_hi_out =
2012
365k
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2013
2014
365k
      neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2015
365k
                     out_shift);
2016
365k
      neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
2017
365k
                     &clamp_hi_out, out_shift);
2018
365k
      neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
2019
365k
                     &clamp_hi_out, out_shift);
2020
365k
      neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
2021
365k
                     &clamp_hi_out, out_shift);
2022
365k
      neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
2023
365k
                     &clamp_hi_out, out_shift);
2024
365k
      neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
2025
365k
                     &clamp_hi_out, out_shift);
2026
365k
      neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
2027
365k
                     &clamp_hi_out, out_shift);
2028
365k
      neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
2029
365k
                     &clamp_hi_out, out_shift);
2030
365k
    }
2031
782k
  }
2032
782k
}
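// Note: the cospi constants loaded at the top of each function above come
// from cospi_arr(bit). As a model (an assumption based on AV1's fixed-point
// convention, not a copy of the table code): entry i holds cos(i*PI/64)
// scaled by 2^bit, so cospi[32] is the 1/sqrt(2) rotation used for DC and
// the pass-through butterflies. (Uses cos()/lround() from <math.h>.)
static int32_t cospi_model(int i, int bit) {
  const double kPi = 3.141592653589793;
  return (int32_t)lround(cos(i * kPi / 64.0) * (double)(1 << bit));
}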
2033
2034
static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2035
403k
                         int bd, int out_shift) {
2036
403k
  const int32_t *cospi = cospi_arr(bit);
2037
403k
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
2038
403k
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
2039
403k
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
2040
403k
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
2041
403k
  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
2042
403k
  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
2043
403k
  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
2044
403k
  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
2045
403k
  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
2046
403k
  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
2047
403k
  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
2048
403k
  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
2049
403k
  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
2050
403k
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
2051
403k
  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
2052
403k
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
2053
403k
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2054
403k
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2055
403k
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2056
403k
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2057
403k
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
2058
403k
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
2059
403k
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2060
403k
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2061
403k
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2062
403k
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2063
403k
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2064
403k
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2065
403k
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2066
403k
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2067
403k
  __m256i u[16], v[16], x, y;
2068
2069
403k
  {
2070
    // stage 0
2071
    // stage 1
2072
    // stage 2
2073
403k
    v[0] = _mm256_mullo_epi32(in[15], cospi2);
2074
403k
    x = _mm256_mullo_epi32(in[0], cospi62);
2075
403k
    v[0] = _mm256_add_epi32(v[0], x);
2076
403k
    v[0] = _mm256_add_epi32(v[0], rnding);
2077
403k
    v[0] = _mm256_srai_epi32(v[0], bit);
2078
2079
403k
    v[1] = _mm256_mullo_epi32(in[15], cospi62);
2080
403k
    x = _mm256_mullo_epi32(in[0], cospi2);
2081
403k
    v[1] = _mm256_sub_epi32(v[1], x);
2082
403k
    v[1] = _mm256_add_epi32(v[1], rnding);
2083
403k
    v[1] = _mm256_srai_epi32(v[1], bit);
2084
2085
403k
    v[2] = _mm256_mullo_epi32(in[13], cospi10);
2086
403k
    x = _mm256_mullo_epi32(in[2], cospi54);
2087
403k
    v[2] = _mm256_add_epi32(v[2], x);
2088
403k
    v[2] = _mm256_add_epi32(v[2], rnding);
2089
403k
    v[2] = _mm256_srai_epi32(v[2], bit);
2090
2091
403k
    v[3] = _mm256_mullo_epi32(in[13], cospi54);
2092
403k
    x = _mm256_mullo_epi32(in[2], cospi10);
2093
403k
    v[3] = _mm256_sub_epi32(v[3], x);
2094
403k
    v[3] = _mm256_add_epi32(v[3], rnding);
2095
403k
    v[3] = _mm256_srai_epi32(v[3], bit);
2096
2097
403k
    v[4] = _mm256_mullo_epi32(in[11], cospi18);
2098
403k
    x = _mm256_mullo_epi32(in[4], cospi46);
2099
403k
    v[4] = _mm256_add_epi32(v[4], x);
2100
403k
    v[4] = _mm256_add_epi32(v[4], rnding);
2101
403k
    v[4] = _mm256_srai_epi32(v[4], bit);
2102
2103
403k
    v[5] = _mm256_mullo_epi32(in[11], cospi46);
2104
403k
    x = _mm256_mullo_epi32(in[4], cospi18);
2105
403k
    v[5] = _mm256_sub_epi32(v[5], x);
2106
403k
    v[5] = _mm256_add_epi32(v[5], rnding);
2107
403k
    v[5] = _mm256_srai_epi32(v[5], bit);
2108
2109
403k
    v[6] = _mm256_mullo_epi32(in[9], cospi26);
2110
403k
    x = _mm256_mullo_epi32(in[6], cospi38);
2111
403k
    v[6] = _mm256_add_epi32(v[6], x);
2112
403k
    v[6] = _mm256_add_epi32(v[6], rnding);
2113
403k
    v[6] = _mm256_srai_epi32(v[6], bit);
2114
2115
403k
    v[7] = _mm256_mullo_epi32(in[9], cospi38);
2116
403k
    x = _mm256_mullo_epi32(in[6], cospi26);
2117
403k
    v[7] = _mm256_sub_epi32(v[7], x);
2118
403k
    v[7] = _mm256_add_epi32(v[7], rnding);
2119
403k
    v[7] = _mm256_srai_epi32(v[7], bit);
2120
2121
403k
    v[8] = _mm256_mullo_epi32(in[7], cospi34);
2122
403k
    x = _mm256_mullo_epi32(in[8], cospi30);
2123
403k
    v[8] = _mm256_add_epi32(v[8], x);
2124
403k
    v[8] = _mm256_add_epi32(v[8], rnding);
2125
403k
    v[8] = _mm256_srai_epi32(v[8], bit);
2126
2127
403k
    v[9] = _mm256_mullo_epi32(in[7], cospi30);
2128
403k
    x = _mm256_mullo_epi32(in[8], cospi34);
2129
403k
    v[9] = _mm256_sub_epi32(v[9], x);
2130
403k
    v[9] = _mm256_add_epi32(v[9], rnding);
2131
403k
    v[9] = _mm256_srai_epi32(v[9], bit);
2132
2133
403k
    v[10] = _mm256_mullo_epi32(in[5], cospi42);
2134
403k
    x = _mm256_mullo_epi32(in[10], cospi22);
2135
403k
    v[10] = _mm256_add_epi32(v[10], x);
2136
403k
    v[10] = _mm256_add_epi32(v[10], rnding);
2137
403k
    v[10] = _mm256_srai_epi32(v[10], bit);
2138
2139
403k
    v[11] = _mm256_mullo_epi32(in[5], cospi22);
2140
403k
    x = _mm256_mullo_epi32(in[10], cospi42);
2141
403k
    v[11] = _mm256_sub_epi32(v[11], x);
2142
403k
    v[11] = _mm256_add_epi32(v[11], rnding);
2143
403k
    v[11] = _mm256_srai_epi32(v[11], bit);
2144
2145
403k
    v[12] = _mm256_mullo_epi32(in[3], cospi50);
2146
403k
    x = _mm256_mullo_epi32(in[12], cospi14);
2147
403k
    v[12] = _mm256_add_epi32(v[12], x);
2148
403k
    v[12] = _mm256_add_epi32(v[12], rnding);
2149
403k
    v[12] = _mm256_srai_epi32(v[12], bit);
2150
2151
403k
    v[13] = _mm256_mullo_epi32(in[3], cospi14);
2152
403k
    x = _mm256_mullo_epi32(in[12], cospi50);
2153
403k
    v[13] = _mm256_sub_epi32(v[13], x);
2154
403k
    v[13] = _mm256_add_epi32(v[13], rnding);
2155
403k
    v[13] = _mm256_srai_epi32(v[13], bit);
2156
2157
403k
    v[14] = _mm256_mullo_epi32(in[1], cospi58);
2158
403k
    x = _mm256_mullo_epi32(in[14], cospi6);
2159
403k
    v[14] = _mm256_add_epi32(v[14], x);
2160
403k
    v[14] = _mm256_add_epi32(v[14], rnding);
2161
403k
    v[14] = _mm256_srai_epi32(v[14], bit);
2162
2163
403k
    v[15] = _mm256_mullo_epi32(in[1], cospi6);
2164
403k
    x = _mm256_mullo_epi32(in[14], cospi58);
2165
403k
    v[15] = _mm256_sub_epi32(v[15], x);
2166
403k
    v[15] = _mm256_add_epi32(v[15], rnding);
2167
403k
    v[15] = _mm256_srai_epi32(v[15], bit);
2168
2169
    // stage 3
2170
403k
    addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2171
403k
    addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2172
403k
    addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2173
403k
    addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2174
403k
    addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2175
403k
    addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2176
403k
    addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2177
403k
    addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2178
2179
    // stage 4
2180
403k
    v[0] = u[0];
2181
403k
    v[1] = u[1];
2182
403k
    v[2] = u[2];
2183
403k
    v[3] = u[3];
2184
403k
    v[4] = u[4];
2185
403k
    v[5] = u[5];
2186
403k
    v[6] = u[6];
2187
403k
    v[7] = u[7];
2188
2189
403k
    v[8] = _mm256_mullo_epi32(u[8], cospi8);
2190
403k
    x = _mm256_mullo_epi32(u[9], cospi56);
2191
403k
    v[8] = _mm256_add_epi32(v[8], x);
2192
403k
    v[8] = _mm256_add_epi32(v[8], rnding);
2193
403k
    v[8] = _mm256_srai_epi32(v[8], bit);
2194
2195
403k
    v[9] = _mm256_mullo_epi32(u[8], cospi56);
2196
403k
    x = _mm256_mullo_epi32(u[9], cospi8);
2197
403k
    v[9] = _mm256_sub_epi32(v[9], x);
2198
403k
    v[9] = _mm256_add_epi32(v[9], rnding);
2199
403k
    v[9] = _mm256_srai_epi32(v[9], bit);
2200
2201
403k
    v[10] = _mm256_mullo_epi32(u[10], cospi40);
2202
403k
    x = _mm256_mullo_epi32(u[11], cospi24);
2203
403k
    v[10] = _mm256_add_epi32(v[10], x);
2204
403k
    v[10] = _mm256_add_epi32(v[10], rnding);
2205
403k
    v[10] = _mm256_srai_epi32(v[10], bit);
2206
2207
403k
    v[11] = _mm256_mullo_epi32(u[10], cospi24);
2208
403k
    x = _mm256_mullo_epi32(u[11], cospi40);
2209
403k
    v[11] = _mm256_sub_epi32(v[11], x);
2210
403k
    v[11] = _mm256_add_epi32(v[11], rnding);
2211
403k
    v[11] = _mm256_srai_epi32(v[11], bit);
2212
2213
403k
    v[12] = _mm256_mullo_epi32(u[12], cospim56);
2214
403k
    x = _mm256_mullo_epi32(u[13], cospi8);
2215
403k
    v[12] = _mm256_add_epi32(v[12], x);
2216
403k
    v[12] = _mm256_add_epi32(v[12], rnding);
2217
403k
    v[12] = _mm256_srai_epi32(v[12], bit);
2218
2219
403k
    v[13] = _mm256_mullo_epi32(u[12], cospi8);
2220
403k
    x = _mm256_mullo_epi32(u[13], cospim56);
2221
403k
    v[13] = _mm256_sub_epi32(v[13], x);
2222
403k
    v[13] = _mm256_add_epi32(v[13], rnding);
2223
403k
    v[13] = _mm256_srai_epi32(v[13], bit);
2224
2225
403k
    v[14] = _mm256_mullo_epi32(u[14], cospim24);
2226
403k
    x = _mm256_mullo_epi32(u[15], cospi40);
2227
403k
    v[14] = _mm256_add_epi32(v[14], x);
2228
403k
    v[14] = _mm256_add_epi32(v[14], rnding);
2229
403k
    v[14] = _mm256_srai_epi32(v[14], bit);
2230
2231
403k
    v[15] = _mm256_mullo_epi32(u[14], cospi40);
2232
403k
    x = _mm256_mullo_epi32(u[15], cospim24);
2233
403k
    v[15] = _mm256_sub_epi32(v[15], x);
2234
403k
    v[15] = _mm256_add_epi32(v[15], rnding);
2235
403k
    v[15] = _mm256_srai_epi32(v[15], bit);
2236
2237
    // stage 5
2238
403k
    addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2239
403k
    addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2240
403k
    addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2241
403k
    addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2242
403k
    addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2243
403k
    addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2244
403k
    addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2245
403k
    addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2246
2247
    // stage 6
2248
403k
    v[0] = u[0];
2249
403k
    v[1] = u[1];
2250
403k
    v[2] = u[2];
2251
403k
    v[3] = u[3];
2252
2253
403k
    v[4] = _mm256_mullo_epi32(u[4], cospi16);
2254
403k
    x = _mm256_mullo_epi32(u[5], cospi48);
2255
403k
    v[4] = _mm256_add_epi32(v[4], x);
2256
403k
    v[4] = _mm256_add_epi32(v[4], rnding);
2257
403k
    v[4] = _mm256_srai_epi32(v[4], bit);
2258
2259
403k
    v[5] = _mm256_mullo_epi32(u[4], cospi48);
2260
403k
    x = _mm256_mullo_epi32(u[5], cospi16);
2261
403k
    v[5] = _mm256_sub_epi32(v[5], x);
2262
403k
    v[5] = _mm256_add_epi32(v[5], rnding);
2263
403k
    v[5] = _mm256_srai_epi32(v[5], bit);
2264
2265
403k
    v[6] = _mm256_mullo_epi32(u[6], cospim48);
2266
403k
    x = _mm256_mullo_epi32(u[7], cospi16);
2267
403k
    v[6] = _mm256_add_epi32(v[6], x);
2268
403k
    v[6] = _mm256_add_epi32(v[6], rnding);
2269
403k
    v[6] = _mm256_srai_epi32(v[6], bit);
2270
2271
403k
    v[7] = _mm256_mullo_epi32(u[6], cospi16);
2272
403k
    x = _mm256_mullo_epi32(u[7], cospim48);
2273
403k
    v[7] = _mm256_sub_epi32(v[7], x);
2274
403k
    v[7] = _mm256_add_epi32(v[7], rnding);
2275
403k
    v[7] = _mm256_srai_epi32(v[7], bit);
2276
2277
403k
    v[8] = u[8];
2278
403k
    v[9] = u[9];
2279
403k
    v[10] = u[10];
2280
403k
    v[11] = u[11];
2281
2282
403k
    v[12] = _mm256_mullo_epi32(u[12], cospi16);
2283
403k
    x = _mm256_mullo_epi32(u[13], cospi48);
2284
403k
    v[12] = _mm256_add_epi32(v[12], x);
2285
403k
    v[12] = _mm256_add_epi32(v[12], rnding);
2286
403k
    v[12] = _mm256_srai_epi32(v[12], bit);
2287
2288
403k
    v[13] = _mm256_mullo_epi32(u[12], cospi48);
2289
403k
    x = _mm256_mullo_epi32(u[13], cospi16);
2290
403k
    v[13] = _mm256_sub_epi32(v[13], x);
2291
403k
    v[13] = _mm256_add_epi32(v[13], rnding);
2292
403k
    v[13] = _mm256_srai_epi32(v[13], bit);
2293
2294
403k
    v[14] = _mm256_mullo_epi32(u[14], cospim48);
2295
403k
    x = _mm256_mullo_epi32(u[15], cospi16);
2296
403k
    v[14] = _mm256_add_epi32(v[14], x);
2297
403k
    v[14] = _mm256_add_epi32(v[14], rnding);
2298
403k
    v[14] = _mm256_srai_epi32(v[14], bit);
2299
2300
403k
    v[15] = _mm256_mullo_epi32(u[14], cospi16);
2301
403k
    x = _mm256_mullo_epi32(u[15], cospim48);
2302
403k
    v[15] = _mm256_sub_epi32(v[15], x);
2303
403k
    v[15] = _mm256_add_epi32(v[15], rnding);
2304
403k
    v[15] = _mm256_srai_epi32(v[15], bit);
2305
2306
    // stage 7
2307
403k
    addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2308
403k
    addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2309
403k
    addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2310
403k
    addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2311
403k
    addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2312
403k
    addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2313
403k
    addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2314
403k
    addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2315
2316
    // stage 8
2317
403k
    v[0] = u[0];
2318
403k
    v[1] = u[1];
2319
2320
403k
    y = _mm256_mullo_epi32(u[2], cospi32);
2321
403k
    x = _mm256_mullo_epi32(u[3], cospi32);
2322
403k
    v[2] = _mm256_add_epi32(y, x);
2323
403k
    v[2] = _mm256_add_epi32(v[2], rnding);
2324
403k
    v[2] = _mm256_srai_epi32(v[2], bit);
2325
2326
403k
    v[3] = _mm256_sub_epi32(y, x);
2327
403k
    v[3] = _mm256_add_epi32(v[3], rnding);
2328
403k
    v[3] = _mm256_srai_epi32(v[3], bit);
2329
2330
403k
    v[4] = u[4];
2331
403k
    v[5] = u[5];
2332
2333
403k
    y = _mm256_mullo_epi32(u[6], cospi32);
2334
403k
    x = _mm256_mullo_epi32(u[7], cospi32);
2335
403k
    v[6] = _mm256_add_epi32(y, x);
2336
403k
    v[6] = _mm256_add_epi32(v[6], rnding);
2337
403k
    v[6] = _mm256_srai_epi32(v[6], bit);
2338
2339
403k
    v[7] = _mm256_sub_epi32(y, x);
2340
403k
    v[7] = _mm256_add_epi32(v[7], rnding);
2341
403k
    v[7] = _mm256_srai_epi32(v[7], bit);
2342
2343
403k
    v[8] = u[8];
2344
403k
    v[9] = u[9];
2345
2346
403k
    y = _mm256_mullo_epi32(u[10], cospi32);
2347
403k
    x = _mm256_mullo_epi32(u[11], cospi32);
2348
403k
    v[10] = _mm256_add_epi32(y, x);
2349
403k
    v[10] = _mm256_add_epi32(v[10], rnding);
2350
403k
    v[10] = _mm256_srai_epi32(v[10], bit);
2351
2352
403k
    v[11] = _mm256_sub_epi32(y, x);
2353
403k
    v[11] = _mm256_add_epi32(v[11], rnding);
2354
403k
    v[11] = _mm256_srai_epi32(v[11], bit);
2355
2356
403k
    v[12] = u[12];
2357
403k
    v[13] = u[13];
2358
2359
403k
    y = _mm256_mullo_epi32(u[14], cospi32);
2360
403k
    x = _mm256_mullo_epi32(u[15], cospi32);
2361
403k
    v[14] = _mm256_add_epi32(y, x);
2362
403k
    v[14] = _mm256_add_epi32(v[14], rnding);
2363
403k
    v[14] = _mm256_srai_epi32(v[14], bit);
2364
2365
403k
    v[15] = _mm256_sub_epi32(y, x);
2366
403k
    v[15] = _mm256_add_epi32(v[15], rnding);
2367
403k
    v[15] = _mm256_srai_epi32(v[15], bit);
2368
2369
    // stage 9
2370
403k
    if (do_cols) {
2371
167k
      out[0] = v[0];
2372
167k
      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
2373
167k
      out[2] = v[12];
2374
167k
      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
2375
167k
      out[4] = v[6];
2376
167k
      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
2377
167k
      out[6] = v[10];
2378
167k
      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
2379
167k
      out[8] = v[3];
2380
167k
      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
2381
167k
      out[10] = v[15];
2382
167k
      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
2383
167k
      out[12] = v[5];
2384
167k
      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
2385
167k
      out[14] = v[9];
2386
167k
      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
2387
235k
    } else {
2388
235k
      const int log_range_out = AOMMAX(16, bd + 6);
2389
235k
      const __m256i clamp_lo_out =
2390
235k
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2391
235k
      const __m256i clamp_hi_out =
2392
235k
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2393
2394
235k
      neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2395
235k
                     out_shift);
2396
235k
      neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
2397
235k
                     &clamp_hi_out, out_shift);
2398
235k
      neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
2399
235k
                     &clamp_hi_out, out_shift);
2400
235k
      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
2401
235k
                     &clamp_hi_out, out_shift);
2402
235k
      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
2403
235k
                     &clamp_hi_out, out_shift);
2404
235k
      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
2405
235k
                     &clamp_hi_out, out_shift);
2406
235k
      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
2407
235k
                     &clamp_hi_out, out_shift);
2408
235k
      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
2409
235k
                     &clamp_hi_out, out_shift);
2410
235k
    }
2411
403k
  }
2412
403k
}
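
Every stage butterfly above funnels through addsub_avx2(). As a point of reference, here is a minimal sketch of that add/sub-with-clamp step, assuming only <immintrin.h>; the name addsub_sketch is hypothetical and this mirrors the helper's observable behavior rather than quoting the instrumented source:

#include <immintrin.h>

static inline void addsub_sketch(__m256i in0, __m256i in1, __m256i *out0,
                                 __m256i *out1, const __m256i *clamp_lo,
                                 const __m256i *clamp_hi) {
  __m256i a0 = _mm256_add_epi32(in0, in1);  // butterfly sum
  __m256i a1 = _mm256_sub_epi32(in0, in1);  // butterfly difference
  a0 = _mm256_min_epi32(_mm256_max_epi32(a0, *clamp_lo), *clamp_hi);
  a1 = _mm256_min_epi32(_mm256_max_epi32(a1, *clamp_lo), *clamp_hi);
  *out0 = a0;
  *out1 = a1;
}
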
2413
static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2414
1.65M
                              int bd, int out_shift) {
2415
1.65M
  const int32_t *cospi = cospi_arr(bit);
2416
1.65M
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2417
1.65M
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2418
1.65M
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2419
1.65M
  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2420
1.65M
  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2421
1.65M
  __m256i x;
2422
2423
  // stage 0
2424
  // stage 1
2425
  // stage 2
2426
  // stage 3
2427
1.65M
  x = _mm256_mullo_epi32(in[0], cospi32);
2428
1.65M
  x = _mm256_add_epi32(x, rnding);
2429
1.65M
  x = _mm256_srai_epi32(x, bit);
2430
2431
  // stage 4
2432
  // stage 5
2433
1.65M
  if (!do_cols) {
2434
416k
    const int log_range_out = AOMMAX(16, bd + 6);
2435
416k
    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
2436
416k
    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2437
416k
    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2438
416k
    x = _mm256_add_epi32(x, offset);
2439
416k
    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
2440
416k
  }
2441
1.65M
  x = _mm256_max_epi32(x, clamp_lo);
2442
1.65M
  x = _mm256_min_epi32(x, clamp_hi);
2443
1.65M
  out[0] = x;
2444
1.65M
  out[1] = x;
2445
1.65M
  out[2] = x;
2446
1.65M
  out[3] = x;
2447
1.65M
  out[4] = x;
2448
1.65M
  out[5] = x;
2449
1.65M
  out[6] = x;
2450
1.65M
  out[7] = x;
2451
1.65M
}
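
idct8x8_low1_avx2 reduces a DC-only 8x8 block to a single multiply by cospi[32] plus a rounding shift, broadcast to all eight rows. The rnding/_mm256_srai_epi32 pairs and the (1 << out_shift) >> 1 offset above are all the same round-half-up shift; a scalar model with a hypothetical helper name:

#include <stdint.h>

// Round-half-up arithmetic shift: add half the divisor, then shift.
// For shift == 0 the offset evaluates to 0 and the value passes through,
// matching the guarded vector code above.
static inline int32_t round_shift_s32(int32_t x, int shift) {
  return (x + ((1 << shift) >> 1)) >> shift;
}
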
2452
static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2453
3.04M
                         int bd, int out_shift) {
2454
3.04M
  const int32_t *cospi = cospi_arr(bit);
2455
3.04M
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2456
3.04M
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
2457
3.04M
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2458
3.04M
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
2459
3.04M
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2460
3.04M
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2461
3.04M
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2462
3.04M
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2463
3.04M
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
2464
3.04M
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2465
3.04M
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2466
3.04M
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2467
3.04M
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2468
3.04M
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2469
3.04M
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
2470
3.04M
  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
2471
3.04M
  __m256i x, y;
2472
2473
  // stage 0
2474
  // stage 1
2475
  // stage 2
2476
3.04M
  u0 = in[0];
2477
3.04M
  u1 = in[4];
2478
3.04M
  u2 = in[2];
2479
3.04M
  u3 = in[6];
2480
2481
3.04M
  x = _mm256_mullo_epi32(in[1], cospi56);
2482
3.04M
  y = _mm256_mullo_epi32(in[7], cospim8);
2483
3.04M
  u4 = _mm256_add_epi32(x, y);
2484
3.04M
  u4 = _mm256_add_epi32(u4, rnding);
2485
3.04M
  u4 = _mm256_srai_epi32(u4, bit);
2486
2487
3.04M
  x = _mm256_mullo_epi32(in[1], cospi8);
2488
3.04M
  y = _mm256_mullo_epi32(in[7], cospi56);
2489
3.04M
  u7 = _mm256_add_epi32(x, y);
2490
3.04M
  u7 = _mm256_add_epi32(u7, rnding);
2491
3.04M
  u7 = _mm256_srai_epi32(u7, bit);
2492
2493
3.04M
  x = _mm256_mullo_epi32(in[5], cospi24);
2494
3.04M
  y = _mm256_mullo_epi32(in[3], cospim40);
2495
3.04M
  u5 = _mm256_add_epi32(x, y);
2496
3.04M
  u5 = _mm256_add_epi32(u5, rnding);
2497
3.04M
  u5 = _mm256_srai_epi32(u5, bit);
2498
2499
3.04M
  x = _mm256_mullo_epi32(in[5], cospi40);
2500
3.04M
  y = _mm256_mullo_epi32(in[3], cospi24);
2501
3.04M
  u6 = _mm256_add_epi32(x, y);
2502
3.04M
  u6 = _mm256_add_epi32(u6, rnding);
2503
3.04M
  u6 = _mm256_srai_epi32(u6, bit);
2504
2505
  // stage 3
2506
3.04M
  x = _mm256_mullo_epi32(u0, cospi32);
2507
3.04M
  y = _mm256_mullo_epi32(u1, cospi32);
2508
3.04M
  v0 = _mm256_add_epi32(x, y);
2509
3.04M
  v0 = _mm256_add_epi32(v0, rnding);
2510
3.04M
  v0 = _mm256_srai_epi32(v0, bit);
2511
2512
3.04M
  v1 = _mm256_sub_epi32(x, y);
2513
3.04M
  v1 = _mm256_add_epi32(v1, rnding);
2514
3.04M
  v1 = _mm256_srai_epi32(v1, bit);
2515
2516
3.04M
  x = _mm256_mullo_epi32(u2, cospi48);
2517
3.04M
  y = _mm256_mullo_epi32(u3, cospim16);
2518
3.04M
  v2 = _mm256_add_epi32(x, y);
2519
3.04M
  v2 = _mm256_add_epi32(v2, rnding);
2520
3.04M
  v2 = _mm256_srai_epi32(v2, bit);
2521
2522
3.04M
  x = _mm256_mullo_epi32(u2, cospi16);
2523
3.04M
  y = _mm256_mullo_epi32(u3, cospi48);
2524
3.04M
  v3 = _mm256_add_epi32(x, y);
2525
3.04M
  v3 = _mm256_add_epi32(v3, rnding);
2526
3.04M
  v3 = _mm256_srai_epi32(v3, bit);
2527
2528
3.04M
  addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
2529
3.04M
  addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
2530
2531
  // stage 4
2532
3.04M
  addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
2533
3.04M
  addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
2534
3.04M
  u4 = v4;
2535
3.04M
  u7 = v7;
2536
2537
3.04M
  x = _mm256_mullo_epi32(v5, cospi32);
2538
3.04M
  y = _mm256_mullo_epi32(v6, cospi32);
2539
3.04M
  u6 = _mm256_add_epi32(y, x);
2540
3.04M
  u6 = _mm256_add_epi32(u6, rnding);
2541
3.04M
  u6 = _mm256_srai_epi32(u6, bit);
2542
2543
3.04M
  u5 = _mm256_sub_epi32(y, x);
2544
3.04M
  u5 = _mm256_add_epi32(u5, rnding);
2545
3.04M
  u5 = _mm256_srai_epi32(u5, bit);
2546
2547
3.04M
  addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
2548
3.04M
  addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
2549
3.04M
  addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
2550
3.04M
  addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
2551
  // stage 5
2552
3.04M
  if (!do_cols) {
2553
657k
    const int log_range_out = AOMMAX(16, bd + 6);
2554
657k
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2555
657k
    const __m256i clamp_hi_out =
2556
657k
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2557
2558
657k
    round_shift_4x4_avx2(out, out_shift);
2559
657k
    round_shift_4x4_avx2(out + 4, out_shift);
2560
657k
    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
2561
657k
  }
2562
3.04M
}
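
Each mullo/mullo/add/add-rnding/srai run in idct8x8_avx2 is one half-butterfly, out = (w0*a + w1*b + 2^(bit-1)) >> bit, i.e. a fixed-point plane rotation by the cospi constants. A sketch of that primitive (same shape as the half_btf_avx2 helper the idct64 stages below call; the name half_btf_sketch is illustrative):

#include <immintrin.h>

static inline __m256i half_btf_sketch(const __m256i *w0, const __m256i *n0,
                                      const __m256i *w1, const __m256i *n1,
                                      const __m256i *rounding, int bit) {
  __m256i x = _mm256_mullo_epi32(*w0, *n0);  // w0 * a
  __m256i y = _mm256_mullo_epi32(*w1, *n1);  // w1 * b
  x = _mm256_add_epi32(x, y);                // w0*a + w1*b
  x = _mm256_add_epi32(x, *rounding);        // + 2^(bit-1)
  return _mm256_srai_epi32(x, bit);          // >> bit
}
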
2563
static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2564
517k
                               int bd, int out_shift) {
2565
517k
  const int32_t *cospi = cospi_arr(bit);
2566
517k
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2567
517k
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2568
517k
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2569
517k
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2570
517k
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2571
517k
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2572
517k
  const __m256i kZero = _mm256_setzero_si256();
2573
517k
  __m256i u[8], x;
2574
2575
  // stage 0
2576
  // stage 1
2577
  // stage 2
2578
2579
517k
  x = _mm256_mullo_epi32(in[0], cospi60);
2580
517k
  u[0] = _mm256_add_epi32(x, rnding);
2581
517k
  u[0] = _mm256_srai_epi32(u[0], bit);
2582
2583
517k
  x = _mm256_mullo_epi32(in[0], cospi4);
2584
517k
  u[1] = _mm256_sub_epi32(kZero, x);
2585
517k
  u[1] = _mm256_add_epi32(u[1], rnding);
2586
517k
  u[1] = _mm256_srai_epi32(u[1], bit);
2587
2588
  // stage 3
2589
  // stage 4
2590
517k
  __m256i temp1, temp2;
2591
517k
  temp1 = _mm256_mullo_epi32(u[0], cospi16);
2592
517k
  x = _mm256_mullo_epi32(u[1], cospi48);
2593
517k
  temp1 = _mm256_add_epi32(temp1, x);
2594
517k
  temp1 = _mm256_add_epi32(temp1, rnding);
2595
517k
  temp1 = _mm256_srai_epi32(temp1, bit);
2596
517k
  u[4] = temp1;
2597
2598
517k
  temp2 = _mm256_mullo_epi32(u[0], cospi48);
2599
517k
  x = _mm256_mullo_epi32(u[1], cospi16);
2600
517k
  u[5] = _mm256_sub_epi32(temp2, x);
2601
517k
  u[5] = _mm256_add_epi32(u[5], rnding);
2602
517k
  u[5] = _mm256_srai_epi32(u[5], bit);
2603
2604
  // stage 5
2605
  // stage 6
2606
517k
  temp1 = _mm256_mullo_epi32(u[0], cospi32);
2607
517k
  x = _mm256_mullo_epi32(u[1], cospi32);
2608
517k
  u[2] = _mm256_add_epi32(temp1, x);
2609
517k
  u[2] = _mm256_add_epi32(u[2], rnding);
2610
517k
  u[2] = _mm256_srai_epi32(u[2], bit);
2611
2612
517k
  u[3] = _mm256_sub_epi32(temp1, x);
2613
517k
  u[3] = _mm256_add_epi32(u[3], rnding);
2614
517k
  u[3] = _mm256_srai_epi32(u[3], bit);
2615
2616
517k
  temp1 = _mm256_mullo_epi32(u[4], cospi32);
2617
517k
  x = _mm256_mullo_epi32(u[5], cospi32);
2618
517k
  u[6] = _mm256_add_epi32(temp1, x);
2619
517k
  u[6] = _mm256_add_epi32(u[6], rnding);
2620
517k
  u[6] = _mm256_srai_epi32(u[6], bit);
2621
2622
517k
  u[7] = _mm256_sub_epi32(temp1, x);
2623
517k
  u[7] = _mm256_add_epi32(u[7], rnding);
2624
517k
  u[7] = _mm256_srai_epi32(u[7], bit);
2625
2626
  // stage 7
2627
517k
  if (do_cols) {
2628
259k
    out[0] = u[0];
2629
259k
    out[1] = _mm256_sub_epi32(kZero, u[4]);
2630
259k
    out[2] = u[6];
2631
259k
    out[3] = _mm256_sub_epi32(kZero, u[2]);
2632
259k
    out[4] = u[3];
2633
259k
    out[5] = _mm256_sub_epi32(kZero, u[7]);
2634
259k
    out[6] = u[5];
2635
259k
    out[7] = _mm256_sub_epi32(kZero, u[1]);
2636
259k
  } else {
2637
258k
    const int log_range_out = AOMMAX(16, bd + 6);
2638
258k
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2639
258k
    const __m256i clamp_hi_out =
2640
258k
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2641
2642
258k
    neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2643
258k
                   out_shift);
2644
258k
    neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2645
258k
                   out_shift);
2646
258k
    neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2647
258k
                   out_shift);
2648
258k
    neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2649
258k
                   out_shift);
2650
258k
  }
2651
517k
}
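
In the !do_cols branch above, each neg_shift_avx2 call produces one positively and one negatively signed output (compare the explicit kZero subtractions in the do_cols branch). A plausible sketch of that helper, assuming it rounds, shifts, negates the second operand, and clamps; reconstructed for illustration, not copied from the source:

#include <immintrin.h>

static inline void neg_shift_sketch(__m256i in0, __m256i in1, __m256i *out0,
                                    __m256i *out1, const __m256i *clamp_lo,
                                    const __m256i *clamp_hi, int shift) {
  const __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
  __m256i a0 = _mm256_add_epi32(offset, in0);  // round(in0)
  __m256i a1 = _mm256_sub_epi32(offset, in1);  // round(-in1)
  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
  a0 = _mm256_min_epi32(_mm256_max_epi32(a0, *clamp_lo), *clamp_hi);
  a1 = _mm256_min_epi32(_mm256_max_epi32(a1, *clamp_lo), *clamp_hi);
  *out0 = a0;
  *out1 = a1;
}
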
2652
2653
static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2654
1.00M
                          int bd, int out_shift) {
2655
1.00M
  const int32_t *cospi = cospi_arr(bit);
2656
1.00M
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2657
1.00M
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2658
1.00M
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
2659
1.00M
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
2660
1.00M
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
2661
1.00M
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
2662
1.00M
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
2663
1.00M
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
2664
1.00M
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2665
1.00M
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2666
1.00M
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2667
1.00M
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2668
1.00M
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2669
1.00M
  const __m256i kZero = _mm256_setzero_si256();
2670
1.00M
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2671
1.00M
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2672
1.00M
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2673
1.00M
  __m256i u[8], v[8], x;
2674
2675
  // stage 0
2676
  // stage 1
2677
  // stage 2
2678
2679
1.00M
  u[0] = _mm256_mullo_epi32(in[7], cospi4);
2680
1.00M
  x = _mm256_mullo_epi32(in[0], cospi60);
2681
1.00M
  u[0] = _mm256_add_epi32(u[0], x);
2682
1.00M
  u[0] = _mm256_add_epi32(u[0], rnding);
2683
1.00M
  u[0] = _mm256_srai_epi32(u[0], bit);
2684
2685
1.00M
  u[1] = _mm256_mullo_epi32(in[7], cospi60);
2686
1.00M
  x = _mm256_mullo_epi32(in[0], cospi4);
2687
1.00M
  u[1] = _mm256_sub_epi32(u[1], x);
2688
1.00M
  u[1] = _mm256_add_epi32(u[1], rnding);
2689
1.00M
  u[1] = _mm256_srai_epi32(u[1], bit);
2690
2691
1.00M
  u[2] = _mm256_mullo_epi32(in[5], cospi20);
2692
1.00M
  x = _mm256_mullo_epi32(in[2], cospi44);
2693
1.00M
  u[2] = _mm256_add_epi32(u[2], x);
2694
1.00M
  u[2] = _mm256_add_epi32(u[2], rnding);
2695
1.00M
  u[2] = _mm256_srai_epi32(u[2], bit);
2696
2697
1.00M
  u[3] = _mm256_mullo_epi32(in[5], cospi44);
2698
1.00M
  x = _mm256_mullo_epi32(in[2], cospi20);
2699
1.00M
  u[3] = _mm256_sub_epi32(u[3], x);
2700
1.00M
  u[3] = _mm256_add_epi32(u[3], rnding);
2701
1.00M
  u[3] = _mm256_srai_epi32(u[3], bit);
2702
2703
1.00M
  u[4] = _mm256_mullo_epi32(in[3], cospi36);
2704
1.00M
  x = _mm256_mullo_epi32(in[4], cospi28);
2705
1.00M
  u[4] = _mm256_add_epi32(u[4], x);
2706
1.00M
  u[4] = _mm256_add_epi32(u[4], rnding);
2707
1.00M
  u[4] = _mm256_srai_epi32(u[4], bit);
2708
2709
1.00M
  u[5] = _mm256_mullo_epi32(in[3], cospi28);
2710
1.00M
  x = _mm256_mullo_epi32(in[4], cospi36);
2711
1.00M
  u[5] = _mm256_sub_epi32(u[5], x);
2712
1.00M
  u[5] = _mm256_add_epi32(u[5], rnding);
2713
1.00M
  u[5] = _mm256_srai_epi32(u[5], bit);
2714
2715
1.00M
  u[6] = _mm256_mullo_epi32(in[1], cospi52);
2716
1.00M
  x = _mm256_mullo_epi32(in[6], cospi12);
2717
1.00M
  u[6] = _mm256_add_epi32(u[6], x);
2718
1.00M
  u[6] = _mm256_add_epi32(u[6], rnding);
2719
1.00M
  u[6] = _mm256_srai_epi32(u[6], bit);
2720
2721
1.00M
  u[7] = _mm256_mullo_epi32(in[1], cospi12);
2722
1.00M
  x = _mm256_mullo_epi32(in[6], cospi52);
2723
1.00M
  u[7] = _mm256_sub_epi32(u[7], x);
2724
1.00M
  u[7] = _mm256_add_epi32(u[7], rnding);
2725
1.00M
  u[7] = _mm256_srai_epi32(u[7], bit);
2726
2727
  // stage 3
2728
1.00M
  addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
2729
1.00M
  addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
2730
1.00M
  addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
2731
1.00M
  addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
2732
2733
  // stage 4
2734
1.00M
  u[0] = v[0];
2735
1.00M
  u[1] = v[1];
2736
1.00M
  u[2] = v[2];
2737
1.00M
  u[3] = v[3];
2738
2739
1.00M
  u[4] = _mm256_mullo_epi32(v[4], cospi16);
2740
1.00M
  x = _mm256_mullo_epi32(v[5], cospi48);
2741
1.00M
  u[4] = _mm256_add_epi32(u[4], x);
2742
1.00M
  u[4] = _mm256_add_epi32(u[4], rnding);
2743
1.00M
  u[4] = _mm256_srai_epi32(u[4], bit);
2744
2745
1.00M
  u[5] = _mm256_mullo_epi32(v[4], cospi48);
2746
1.00M
  x = _mm256_mullo_epi32(v[5], cospi16);
2747
1.00M
  u[5] = _mm256_sub_epi32(u[5], x);
2748
1.00M
  u[5] = _mm256_add_epi32(u[5], rnding);
2749
1.00M
  u[5] = _mm256_srai_epi32(u[5], bit);
2750
2751
1.00M
  u[6] = _mm256_mullo_epi32(v[6], cospim48);
2752
1.00M
  x = _mm256_mullo_epi32(v[7], cospi16);
2753
1.00M
  u[6] = _mm256_add_epi32(u[6], x);
2754
1.00M
  u[6] = _mm256_add_epi32(u[6], rnding);
2755
1.00M
  u[6] = _mm256_srai_epi32(u[6], bit);
2756
2757
1.00M
  u[7] = _mm256_mullo_epi32(v[6], cospi16);
2758
1.00M
  x = _mm256_mullo_epi32(v[7], cospim48);
2759
1.00M
  u[7] = _mm256_sub_epi32(u[7], x);
2760
1.00M
  u[7] = _mm256_add_epi32(u[7], rnding);
2761
1.00M
  u[7] = _mm256_srai_epi32(u[7], bit);
2762
2763
  // stage 5
2764
1.00M
  addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
2765
1.00M
  addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
2766
1.00M
  addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
2767
1.00M
  addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
2768
2769
  // stage 6
2770
1.00M
  u[0] = v[0];
2771
1.00M
  u[1] = v[1];
2772
1.00M
  u[4] = v[4];
2773
1.00M
  u[5] = v[5];
2774
2775
1.00M
  v[0] = _mm256_mullo_epi32(v[2], cospi32);
2776
1.00M
  x = _mm256_mullo_epi32(v[3], cospi32);
2777
1.00M
  u[2] = _mm256_add_epi32(v[0], x);
2778
1.00M
  u[2] = _mm256_add_epi32(u[2], rnding);
2779
1.00M
  u[2] = _mm256_srai_epi32(u[2], bit);
2780
2781
1.00M
  u[3] = _mm256_sub_epi32(v[0], x);
2782
1.00M
  u[3] = _mm256_add_epi32(u[3], rnding);
2783
1.00M
  u[3] = _mm256_srai_epi32(u[3], bit);
2784
2785
1.00M
  v[0] = _mm256_mullo_epi32(v[6], cospi32);
2786
1.00M
  x = _mm256_mullo_epi32(v[7], cospi32);
2787
1.00M
  u[6] = _mm256_add_epi32(v[0], x);
2788
1.00M
  u[6] = _mm256_add_epi32(u[6], rnding);
2789
1.00M
  u[6] = _mm256_srai_epi32(u[6], bit);
2790
2791
1.00M
  u[7] = _mm256_sub_epi32(v[0], x);
2792
1.00M
  u[7] = _mm256_add_epi32(u[7], rnding);
2793
1.00M
  u[7] = _mm256_srai_epi32(u[7], bit);
2794
2795
  // stage 7
2796
1.00M
  if (do_cols) {
2797
561k
    out[0] = u[0];
2798
561k
    out[1] = _mm256_sub_epi32(kZero, u[4]);
2799
561k
    out[2] = u[6];
2800
561k
    out[3] = _mm256_sub_epi32(kZero, u[2]);
2801
561k
    out[4] = u[3];
2802
561k
    out[5] = _mm256_sub_epi32(kZero, u[7]);
2803
561k
    out[6] = u[5];
2804
561k
    out[7] = _mm256_sub_epi32(kZero, u[1]);
2805
561k
  } else {
2806
443k
    const int log_range_out = AOMMAX(16, bd + 6);
2807
443k
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2808
443k
    const __m256i clamp_hi_out =
2809
443k
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2810
2811
443k
    neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2812
443k
                   out_shift);
2813
443k
    neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2814
443k
                   out_shift);
2815
443k
    neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2816
443k
                   out_shift);
2817
443k
    neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2818
443k
                   out_shift);
2819
443k
  }
2820
1.00M
}
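
Stage 7 of the iadst8 is pure bookkeeping: a fixed permutation with alternating sign flips, identical in both branches above up to the rounding shift. The same mapping in scalar form (hypothetical helper, transcribed from the do_cols branch):

#include <stdint.h>

// out[0..7] = { u[0], -u[4], u[6], -u[2], u[3], -u[7], u[5], -u[1] }
static void iadst8_output_map(const int32_t *u, int32_t *out) {
  static const int src[8] = { 0, 4, 6, 2, 3, 7, 5, 1 };
  for (int i = 0; i < 8; i++) out[i] = (i & 1) ? -u[src[i]] : u[src[i]];
}
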
2821
static INLINE void idct64_stage8_avx2(
2822
    __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
2823
    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
2824
    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
2825
719k
    const __m256i *rnding, int bit) {
2826
719k
  int i;
2827
719k
  __m256i temp1, temp2, temp3, temp4;
2828
719k
  temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
2829
719k
  u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
2830
719k
  u[10] = temp1;
2831
719k
  temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
2832
719k
  u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
2833
719k
  u[11] = temp2;
2834
2835
3.59M
  for (i = 16; i < 20; ++i) {
2836
2.87M
    addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
2837
2.87M
    addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
2838
2.87M
  }
2839
2840
719k
  temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
2841
719k
  temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
2842
719k
  temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
2843
719k
  temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
2844
719k
  u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
2845
719k
  u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
2846
719k
  u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
2847
719k
  u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
2848
719k
  u[36] = temp1;
2849
719k
  u[37] = temp2;
2850
719k
  u[38] = temp3;
2851
719k
  u[39] = temp4;
2852
2853
719k
  temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
2854
719k
  temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
2855
719k
  temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
2856
719k
  temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
2857
719k
  u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
2858
719k
  u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
2859
719k
  u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
2860
719k
  u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
2861
719k
  u[40] = temp1;
2862
719k
  u[41] = temp2;
2863
719k
  u[42] = temp3;
2864
719k
  u[43] = temp4;
2865
719k
}
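
The i ^ 7 / i ^ 15 / i ^ 8 indexing in idct64_stage8_avx2 pairs mirrored butterfly lanes without a second loop counter. A standalone demo of the index pattern it generates:

#include <stdio.h>

int main(void) {
  // Prints (16,23) (31,24), (17,22) (30,25), (18,21) (29,26),
  // (19,20) (28,27): the two mirrored pairings fed to addsub_avx2
  // in the loop above.
  for (int i = 16; i < 20; ++i)
    printf("(%d,%d) (%d,%d)\n", i, i ^ 7, i ^ 15, i ^ 8);
  return 0;
}
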
2866
2867
static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
2868
                                      const __m256i *cospi32,
2869
                                      const __m256i *clamp_lo,
2870
                                      const __m256i *clamp_hi,
2871
719k
                                      const __m256i *rnding, int bit) {
2872
719k
  int i;
2873
719k
  __m256i temp1, temp2, temp3, temp4;
2874
6.47M
  for (i = 0; i < 8; ++i) {
2875
5.75M
    addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
2876
5.75M
  }
2877
2878
719k
  temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
2879
719k
  temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
2880
719k
  temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
2881
719k
  temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
2882
719k
  u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
2883
719k
  u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
2884
719k
  u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
2885
719k
  u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
2886
719k
  u[20] = temp1;
2887
719k
  u[21] = temp2;
2888
719k
  u[22] = temp3;
2889
719k
  u[23] = temp4;
2890
6.47M
  for (i = 32; i < 40; i++) {
2891
5.75M
    addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
2892
5.75M
  }
2893
2894
6.47M
  for (i = 48; i < 56; i++) {
2895
5.75M
    addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
2896
5.75M
  }
2897
719k
}
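
The cospim32/cospi32 pairs in stages 8-10 are orthogonal rotations by pi/4: with c = cospi[32] = round(2^bit * cos(pi/4)) (2896 at bit = 12), each pair computes

  temp = (-c * u[a] + c * u[b] + 2^(bit-1)) >> bit  ~  (u[b] - u[a]) / sqrt(2)
  u[b] = ( c * u[a] + c * u[b] + 2^(bit-1)) >> bit  ~  (u[a] + u[b]) / sqrt(2)

so the rotated lanes stay at the same scale as the pass-through lanes.
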
2898
2899
static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
2900
                                       const __m256i *cospi32,
2901
                                       const __m256i *clamp_lo,
2902
                                       const __m256i *clamp_hi,
2903
719k
                                       const __m256i *rnding, int bit) {
2904
719k
  __m256i temp1, temp2, temp3, temp4;
2905
12.2M
  for (int i = 0; i < 16; i++) {
2906
11.5M
    addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
2907
11.5M
  }
2908
2909
719k
  temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
2910
719k
  temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
2911
719k
  temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
2912
719k
  temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
2913
719k
  u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
2914
719k
  u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
2915
719k
  u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
2916
719k
  u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
2917
719k
  u[40] = temp1;
2918
719k
  u[41] = temp2;
2919
719k
  u[42] = temp3;
2920
719k
  u[43] = temp4;
2921
2922
719k
  temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
2923
719k
  temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
2924
719k
  temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
2925
719k
  temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
2926
719k
  u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
2927
719k
  u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
2928
719k
  u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
2929
719k
  u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
2930
719k
  u[44] = temp1;
2931
719k
  u[45] = temp2;
2932
719k
  u[46] = temp3;
2933
719k
  u[47] = temp4;
2934
719k
}
2935
2936
static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
2937
                                       int bd, int out_shift,
2938
                                       const __m256i *clamp_lo,
2939
719k
                                       const __m256i *clamp_hi) {
2940
23.7M
  for (int i = 0; i < 32; i++) {
2941
23.0M
    addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi);
2942
23.0M
  }
2943
2944
719k
  if (!do_cols) {
2945
264k
    const int log_range_out = AOMMAX(16, bd + 6);
2946
264k
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2947
264k
    const __m256i clamp_hi_out =
2948
264k
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2949
2950
264k
    round_shift_8x8_avx2(out, out_shift);
2951
264k
    round_shift_8x8_avx2(out + 16, out_shift);
2952
264k
    round_shift_8x8_avx2(out + 32, out_shift);
2953
264k
    round_shift_8x8_avx2(out + 48, out_shift);
2954
264k
    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
2955
264k
  }
2956
719k
}
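
For the row pass (!do_cols) the stage-11 epilogue round-shifts into the narrower inter-pass range before the column transform runs. A per-coefficient scalar equivalent of that epilogue (hypothetical helper):

#include <stdint.h>

static int32_t stage11_round_clamp(int32_t x, int shift, int bd) {
  const int log_range_out = (bd + 6 > 16) ? bd + 6 : 16;  // AOMMAX(16, bd + 6)
  const int32_t lo = -(1 << (log_range_out - 1));
  const int32_t hi = (1 << (log_range_out - 1)) - 1;
  if (shift != 0) x = (x + (1 << (shift - 1))) >> shift;  // round_shift_8x8
  return x < lo ? lo : (x > hi ? hi : x);                 // highbd_clamp_epi32
}
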
2957
2958
static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2959
291k
                             int bd, int out_shift) {
2960
291k
  const int32_t *cospi = cospi_arr(bit);
2961
291k
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2962
291k
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2963
291k
  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2964
291k
  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2965
2966
291k
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2967
2968
291k
  {
2969
291k
    __m256i x;
2970
2971
    // stage 1
2972
    // stage 2
2973
    // stage 3
2974
    // stage 4
2975
    // stage 5
2976
    // stage 6
2977
291k
    x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
2978
2979
    // stage 8
2980
    // stage 9
2981
    // stage 10
2982
    // stage 11
2983
291k
    if (!do_cols) {
2984
78.0k
      const int log_range_out = AOMMAX(16, bd + 6);
2985
78.0k
      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2986
78.0k
      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2987
78.0k
      if (out_shift != 0) {
2988
78.0k
        __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
2989
78.0k
        x = _mm256_add_epi32(x, offset);
2990
78.0k
        x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
2991
78.0k
      }
2992
78.0k
    }
2993
291k
    x = _mm256_max_epi32(x, clamp_lo);
2994
291k
    x = _mm256_min_epi32(x, clamp_hi);
2995
291k
    out[0] = x;
2996
291k
    out[1] = x;
2997
291k
    out[2] = x;
2998
291k
    out[3] = x;
2999
291k
    out[4] = x;
3000
291k
    out[5] = x;
3001
291k
    out[6] = x;
3002
291k
    out[7] = x;
3003
291k
    out[8] = x;
3004
291k
    out[9] = x;
3005
291k
    out[10] = x;
3006
291k
    out[11] = x;
3007
291k
    out[12] = x;
3008
291k
    out[13] = x;
3009
291k
    out[14] = x;
3010
291k
    out[15] = x;
3011
291k
    out[16] = x;
3012
291k
    out[17] = x;
3013
291k
    out[18] = x;
3014
291k
    out[19] = x;
3015
291k
    out[20] = x;
3016
291k
    out[21] = x;
3017
291k
    out[22] = x;
3018
291k
    out[23] = x;
3019
291k
    out[24] = x;
3020
291k
    out[25] = x;
3021
291k
    out[26] = x;
3022
291k
    out[27] = x;
3023
291k
    out[28] = x;
3024
291k
    out[29] = x;
3025
291k
    out[30] = x;
3026
291k
    out[31] = x;
3027
291k
    out[32] = x;
3028
291k
    out[33] = x;
3029
291k
    out[34] = x;
3030
291k
    out[35] = x;
3031
291k
    out[36] = x;
3032
291k
    out[37] = x;
3033
291k
    out[38] = x;
3034
291k
    out[39] = x;
3035
291k
    out[40] = x;
3036
291k
    out[41] = x;
3037
291k
    out[42] = x;
3038
291k
    out[43] = x;
3039
291k
    out[44] = x;
3040
291k
    out[45] = x;
3041
291k
    out[46] = x;
3042
291k
    out[47] = x;
3043
291k
    out[48] = x;
3044
291k
    out[49] = x;
3045
291k
    out[50] = x;
3046
291k
    out[51] = x;
3047
291k
    out[52] = x;
3048
291k
    out[53] = x;
3049
291k
    out[54] = x;
3050
291k
    out[55] = x;
3051
291k
    out[56] = x;
3052
291k
    out[57] = x;
3053
291k
    out[58] = x;
3054
291k
    out[59] = x;
3055
291k
    out[60] = x;
3056
291k
    out[61] = x;
3057
291k
    out[62] = x;
3058
291k
    out[63] = x;
3059
291k
  }
3060
291k
}
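
idct64_low1_avx2 is the DC-only fast path: stages 1-6 collapse to a single one-operand half-butterfly by cospi[32], and the result is broadcast to all 64 output rows. A sketch of that one-operand primitive, mirroring the shape of half_btf_0_avx2 (name is illustrative):

#include <immintrin.h>

static inline __m256i half_btf_0_sketch(const __m256i *w0, const __m256i *n0,
                                        const __m256i *rounding, int bit) {
  __m256i x = _mm256_mullo_epi32(*w0, *n0);  // one weighted input
  x = _mm256_add_epi32(x, *rounding);        // + 2^(bit-1)
  return _mm256_srai_epi32(x, bit);          // >> bit
}
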
3061
static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
3062
393k
                             int bd, int out_shift) {
3063
393k
  int i, j;
3064
393k
  const int32_t *cospi = cospi_arr(bit);
3065
393k
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3066
393k
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3067
393k
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3068
393k
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3069
3070
393k
  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3071
393k
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3072
393k
  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3073
393k
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3074
393k
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3075
393k
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3076
393k
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3077
393k
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3078
393k
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3079
393k
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3080
393k
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3081
393k
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3082
393k
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3083
393k
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3084
393k
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3085
393k
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3086
393k
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3087
393k
  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3088
393k
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3089
393k
  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3090
393k
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3091
393k
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3092
393k
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3093
393k
  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3094
393k
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3095
393k
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3096
393k
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3097
393k
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3098
393k
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3099
393k
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3100
393k
  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3101
393k
  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3102
393k
  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3103
393k
  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3104
393k
  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3105
393k
  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3106
393k
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3107
393k
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3108
3109
393k
  {
3110
393k
    __m256i u[64];
3111
3112
    // stage 1
3113
393k
    u[0] = in[0];
3114
393k
    u[8] = in[4];
3115
393k
    u[16] = in[2];
3116
393k
    u[24] = in[6];
3117
393k
    u[32] = in[1];
3118
393k
    u[40] = in[5];
3119
393k
    u[48] = in[3];
3120
393k
    u[56] = in[7];
3121
3122
    // stage 2
3123
393k
    u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3124
393k
    u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3125
393k
    u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3126
393k
    u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3127
393k
    u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3128
393k
    u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3129
393k
    u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3130
393k
    u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3131
3132
    // stage 3
3133
393k
    u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
3134
393k
    u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
3135
393k
    u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
3136
393k
    u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
3137
393k
    u[33] = u[32];
3138
393k
    u[38] = u[39];
3139
393k
    u[41] = u[40];
3140
393k
    u[46] = u[47];
3141
393k
    u[49] = u[48];
3142
393k
    u[54] = u[55];
3143
393k
    u[57] = u[56];
3144
393k
    u[62] = u[63];
3145
3146
    // stage 4
3147
393k
    __m256i temp1, temp2;
3148
393k
    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3149
393k
    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3150
393k
    u[17] = u[16];
3151
393k
    u[22] = u[23];
3152
393k
    u[25] = u[24];
3153
393k
    u[30] = u[31];
3154
3155
393k
    temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3156
393k
    u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3157
393k
    u[33] = temp1;
3158
3159
393k
    temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3160
393k
    u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3161
393k
    u[57] = temp2;
3162
3163
393k
    temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3164
393k
    u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3165
393k
    u[41] = temp1;
3166
3167
393k
    temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3168
393k
    u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3169
393k
    u[46] = temp2;
3170
3171
    // stage 5
3172
393k
    u[9] = u[8];
3173
393k
    u[14] = u[15];
3174
3175
393k
    temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3176
393k
    u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3177
393k
    u[17] = temp1;
3178
3179
393k
    temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3180
393k
    u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3181
393k
    u[22] = temp2;
3182
3183
393k
    u[35] = u[32];
3184
393k
    u[34] = u[33];
3185
393k
    u[36] = u[39];
3186
393k
    u[37] = u[38];
3187
393k
    u[43] = u[40];
3188
393k
    u[42] = u[41];
3189
393k
    u[44] = u[47];
3190
393k
    u[45] = u[46];
3191
393k
    u[51] = u[48];
3192
393k
    u[50] = u[49];
3193
393k
    u[52] = u[55];
3194
393k
    u[53] = u[54];
3195
393k
    u[59] = u[56];
3196
393k
    u[58] = u[57];
3197
393k
    u[60] = u[63];
3198
393k
    u[61] = u[62];
3199
3200
    // stage 6
3201
393k
    temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3202
393k
    u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3203
393k
    u[0] = temp1;
3204
3205
393k
    temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3206
393k
    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3207
393k
    u[9] = temp2;
3208
393k
    u[19] = u[16];
3209
393k
    u[18] = u[17];
3210
393k
    u[20] = u[23];
3211
393k
    u[21] = u[22];
3212
393k
    u[27] = u[24];
3213
393k
    u[26] = u[25];
3214
393k
    u[28] = u[31];
3215
393k
    u[29] = u[30];
3216
3217
393k
    temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3218
393k
    u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3219
393k
    u[34] = temp1;
3220
393k
    temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3221
393k
    u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3222
393k
    u[35] = temp2;
3223
393k
    temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3224
393k
    u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3225
393k
    u[36] = temp1;
3226
393k
    temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3227
393k
    u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3228
393k
    u[37] = temp2;
3229
393k
    temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3230
393k
    u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3231
393k
    u[42] = temp1;
3232
393k
    temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3233
393k
    u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3234
393k
    u[43] = temp2;
3235
393k
    temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3236
393k
    u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3237
393k
    u[44] = temp1;
3238
393k
    temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3239
393k
    u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3240
393k
    u[45] = temp2;
3241
3242
    // stage 7
3243
393k
    u[3] = u[0];
3244
393k
    u[2] = u[1];
3245
393k
    u[11] = u[8];
3246
393k
    u[10] = u[9];
3247
393k
    u[12] = u[15];
3248
393k
    u[13] = u[14];
3249
3250
393k
    temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3251
393k
    u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3252
393k
    u[18] = temp1;
3253
393k
    temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3254
393k
    u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3255
393k
    u[19] = temp2;
3256
393k
    temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3257
393k
    u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3258
393k
    u[20] = temp1;
3259
393k
    temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3260
393k
    u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3261
393k
    u[21] = temp2;
3262
1.18M
    for (i = 32; i < 64; i += 16) {
3263
3.93M
      for (j = i; j < i + 4; j++) {
3264
3.14M
        addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3265
3.14M
        addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3266
3.14M
                    &clamp_hi);
3267
3.14M
      }
3268
787k
    }
3269
3270
    // stage 8
3271
393k
    u[7] = u[0];
3272
393k
    u[6] = u[1];
3273
393k
    u[5] = u[2];
3274
393k
    u[4] = u[3];
3275
3276
393k
    idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3277
393k
                       &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3278
3279
    // stage 9
3280
393k
    idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3281
393k
                       bit);
3282
3283
    // stage 10
3284
393k
    idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3285
393k
                        bit);
3286
3287
    // stage 11
3288
393k
    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3289
393k
  }
3290
393k
}
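
A worked example of the clamp ranges used throughout these kernels, for bd = 10: the row pass (do_cols = 0) clamps intermediates to log_range = max(16, 10 + 8) = 18 bits, i.e. [-131072, 131071]; the column pass (do_cols = 1) uses max(16, 10 + 6) = 16 bits, i.e. [-32768, 32767]; and the post-shift output range log_range_out = max(16, 10 + 6) = 16 matches the column-pass bound, so row-pass results always fit the next pass's input range.
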
3291
static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
3292
326k
                              int bd, int out_shift) {
3293
326k
  int i, j;
3294
326k
  const int32_t *cospi = cospi_arr(bit);
3295
326k
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3296
326k
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3297
326k
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3298
326k
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3299
3300
326k
  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3301
326k
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3302
326k
  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3303
326k
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3304
326k
  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3305
326k
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3306
326k
  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3307
326k
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3308
326k
  const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
3309
326k
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3310
326k
  const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
3311
326k
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3312
326k
  const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
3313
326k
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3314
326k
  const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
3315
326k
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3316
326k
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3317
326k
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3318
326k
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3319
326k
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3320
326k
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
3321
326k
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3322
326k
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3323
326k
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3324
326k
  const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
3325
326k
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
3326
326k
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3327
326k
  const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
3328
326k
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3329
326k
  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3330
326k
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3331
326k
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3332
326k
  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3333
3334
326k
  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3335
326k
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3336
326k
  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3337
326k
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3338
326k
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3339
326k
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3340
326k
  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3341
326k
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3342
326k
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3343
326k
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3344
326k
  const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
3345
326k
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3346
326k
  const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
3347
326k
  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
3348
326k
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3349
326k
  const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
3350
326k
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3351
326k
  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3352
326k
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3353
326k
  const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
3354
326k
  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3355
3356
326k
  {
3357
326k
    __m256i u[64];
3358
326k
    __m256i tmp1, tmp2, tmp3, tmp4;
3359
    // stage 1
3360
326k
    u[0] = in[0];
3361
326k
    u[32] = in[1];
3362
326k
    u[36] = in[9];
3363
326k
    u[40] = in[5];
3364
326k
    u[44] = in[13];
3365
326k
    u[48] = in[3];
3366
326k
    u[52] = in[11];
3367
326k
    u[56] = in[7];
3368
326k
    u[60] = in[15];
3369
326k
    u[16] = in[2];
3370
326k
    u[20] = in[10];
3371
326k
    u[24] = in[6];
3372
326k
    u[28] = in[14];
3373
326k
    u[4] = in[8];
3374
326k
    u[8] = in[4];
3375
326k
    u[12] = in[12];
3376
3377
    // stage 2
3378
326k
    u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3379
326k
    u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3380
326k
    u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
3381
326k
    u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
3382
326k
    u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
3383
326k
    u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
3384
326k
    u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3385
326k
    u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3386
326k
    u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3387
326k
    u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3388
326k
    u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
3389
326k
    u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
3390
326k
    u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3391
326k
    u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3392
326k
    u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
3393
326k
    u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
3394
3395
    // stage 3
3396
326k
    u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
3397
326k
    u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
3398
326k
    u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
3399
326k
    u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
3400
326k
    u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
3401
326k
    u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
3402
326k
    u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
3403
326k
    u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
3404
326k
    u[33] = u[32];
3405
326k
    u[34] = u[35];
3406
326k
    u[37] = u[36];
3407
326k
    u[38] = u[39];
3408
326k
    u[41] = u[40];
3409
326k
    u[42] = u[43];
3410
326k
    u[45] = u[44];
3411
326k
    u[46] = u[47];
3412
326k
    u[49] = u[48];
3413
326k
    u[50] = u[51];
3414
326k
    u[53] = u[52];
3415
326k
    u[54] = u[55];
3416
326k
    u[57] = u[56];
3417
326k
    u[58] = u[59];
3418
326k
    u[61] = u[60];
3419
326k
    u[62] = u[63];
3420
3421
    // stage 4
3422
326k
    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3423
326k
    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3424
326k
    u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
3425
326k
    u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
3426
3427
326k
    u[17] = u[16];
3428
326k
    u[18] = u[19];
3429
326k
    u[21] = u[20];
3430
326k
    u[22] = u[23];
3431
326k
    u[25] = u[24];
3432
326k
    u[26] = u[27];
3433
326k
    u[29] = u[28];
3434
326k
    u[30] = u[31];
3435
3436
326k
    tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3437
326k
    tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3438
326k
    tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3439
326k
    tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3440
326k
    u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3441
326k
    u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3442
326k
    u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3443
326k
    u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3444
326k
    u[33] = tmp1;
3445
326k
    u[34] = tmp2;
3446
326k
    u[37] = tmp3;
3447
326k
    u[38] = tmp4;
3448
3449
326k
    tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3450
326k
    tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3451
326k
    tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3452
326k
    tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3453
326k
    u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3454
326k
    u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3455
326k
    u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3456
326k
    u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3457
326k
    u[41] = tmp1;
3458
326k
    u[42] = tmp2;
3459
326k
    u[45] = tmp3;
3460
326k
    u[46] = tmp4;
3461
3462
    // stage 5
3463
326k
    u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
3464
326k
    u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
3465
3466
326k
    u[9] = u[8];
3467
326k
    u[10] = u[11];
3468
326k
    u[13] = u[12];
3469
326k
    u[14] = u[15];
3470
3471
326k
    tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3472
326k
    tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
3473
326k
    tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
3474
326k
    tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3475
326k
    u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3476
326k
    u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
3477
326k
    u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
3478
326k
    u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3479
326k
    u[17] = tmp1;
3480
326k
    u[18] = tmp2;
3481
326k
    u[21] = tmp3;
3482
326k
    u[22] = tmp4;
3483
3484
1.63M
    for (i = 32; i < 64; i += 8) {
3485
1.30M
      addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3486
1.30M
                  &clamp_hi);
3487
1.30M
      addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3488
1.30M
                  &clamp_hi);
3489
3490
1.30M
      addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3491
1.30M
                  &clamp_hi);
3492
1.30M
      addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3493
1.30M
                  &clamp_hi);
3494
1.30M
    }
3495
3496
    // stage 6
3497
326k
    tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3498
326k
    u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3499
326k
    u[0] = tmp1;
3500
326k
    u[5] = u[4];
3501
326k
    u[6] = u[7];
3502
3503
326k
    tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3504
326k
    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3505
326k
    u[9] = tmp1;
3506
326k
    tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3507
326k
    u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3508
326k
    u[10] = tmp2;
3509
3510
978k
    for (i = 16; i < 32; i += 8) {
3511
652k
      addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3512
652k
                  &clamp_hi);
3513
652k
      addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3514
652k
                  &clamp_hi);
3515
3516
652k
      addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3517
652k
                  &clamp_hi);
3518
652k
      addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3519
652k
                  &clamp_hi);
3520
652k
    }
3521
3522
326k
    tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3523
326k
    tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3524
326k
    tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3525
326k
    tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3526
326k
    u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3527
326k
    u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3528
326k
    u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3529
326k
    u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3530
326k
    u[34] = tmp1;
3531
326k
    u[35] = tmp2;
3532
326k
    u[36] = tmp3;
3533
326k
    u[37] = tmp4;
3534
3535
326k
    tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3536
326k
    tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3537
326k
    tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3538
326k
    tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3539
326k
    u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3540
326k
    u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3541
326k
    u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3542
326k
    u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3543
326k
    u[42] = tmp1;
3544
326k
    u[43] = tmp2;
3545
326k
    u[44] = tmp3;
3546
326k
    u[45] = tmp4;
3547
3548
    // stage 7
3549
326k
    u[3] = u[0];
3550
326k
    u[2] = u[1];
3551
326k
    tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
3552
326k
    u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
3553
326k
    u[5] = tmp1;
3554
326k
    addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3555
326k
    addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3556
326k
    addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3557
326k
    addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3558
3559
326k
    tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3560
326k
    tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3561
326k
    tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3562
326k
    tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3563
326k
    u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3564
326k
    u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3565
326k
    u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3566
326k
    u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3567
326k
    u[18] = tmp1;
3568
326k
    u[19] = tmp2;
3569
326k
    u[20] = tmp3;
3570
326k
    u[21] = tmp4;
3571
3572
978k
    for (i = 32; i < 64; i += 16) {
3573
3.26M
      for (j = i; j < i + 4; j++) {
3574
2.60M
        addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3575
2.60M
        addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3576
2.60M
                    &clamp_hi);
3577
2.60M
      }
3578
652k
    }
3579
3580
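    // Note: stages 8-11 are delegated to the idct64_stage8_avx2() ...
    // idct64_stage11_avx2() helpers; these final butterfly and add/sub
    // stages do not depend on how many input coefficients were nonzero,
    // so they can be shared by the idct64_low*_avx2 variants.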
    // stage 8
3581
1.63M
    for (i = 0; i < 4; ++i) {
3582
1.30M
      addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
3583
1.30M
    }
3584
3585
326k
    idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3586
326k
                       &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3587
3588
    // stage 9
3589
326k
    idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3590
326k
                       bit);
3591
3592
    // stage 10
3593
326k
    idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3594
326k
                        bit);
3595
3596
    // stage 11
3597
326k
    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3598
326k
  }
3599
326k
}
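As a rough scalar reference (not the library code), each lane of the
half_btf_avx2()/half_btf_0_avx2() butterfly primitive used throughout the
stages above computes a weighted sum of one or two inputs followed by a
round-to-nearest right shift by 'bit'; the weights are the cospi[] constants
loaded at the top of each transform. A minimal sketch, with illustrative
names:

#include <stdint.h>

// One lane of half_btf_avx2(): w0*in0 + w1*in1, rounded and shifted.
static int32_t half_btf_scalar(int32_t w0, int32_t in0, int32_t w1,
                               int32_t in1, int bit) {
  const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
}

// One lane of half_btf_0_avx2(): the single-input special case.
static int32_t half_btf_0_scalar(int32_t w0, int32_t in0, int bit) {
  return (int32_t)(((int64_t)w0 * in0 + ((int64_t)1 << (bit - 1))) >> bit);
}

The AVX2 versions do the same arithmetic with 32-bit lane multiplies
(_mm256_mullo_epi32), relying on the per-stage clamping to keep the
intermediates in range.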
3600
static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
3601
250k
                        int out_shift) {
3602
250k
  int i, j;
3603
250k
  const int32_t *cospi = cospi_arr(bit);
3604
250k
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3605
250k
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3606
250k
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3607
250k
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3608
3609
250k
  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3610
250k
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3611
250k
  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3612
250k
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3613
250k
  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3614
250k
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3615
250k
  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3616
250k
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3617
250k
  const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
3618
250k
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3619
250k
  const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
3620
250k
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3621
250k
  const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
3622
250k
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3623
250k
  const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
3624
250k
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3625
250k
  const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
3626
250k
  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
3627
250k
  const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
3628
250k
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3629
250k
  const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
3630
250k
  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
3631
250k
  const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
3632
250k
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3633
250k
  const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
3634
250k
  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
3635
250k
  const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
3636
250k
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3637
250k
  const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
3638
250k
  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
3639
250k
  const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
3640
250k
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3641
250k
  const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
3642
250k
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
3643
250k
  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
3644
250k
  const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
3645
250k
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3646
250k
  const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
3647
250k
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3648
250k
  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
3649
250k
  const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
3650
250k
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3651
250k
  const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
3652
250k
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
3653
250k
  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3654
250k
  const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
3655
250k
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3656
250k
  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3657
250k
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3658
250k
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3659
250k
  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3660
3661
250k
  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3662
250k
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3663
250k
  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3664
250k
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3665
250k
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3666
250k
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3667
250k
  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3668
250k
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3669
250k
  const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
3670
250k
  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
3671
250k
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3672
250k
  const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
3673
250k
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3674
250k
  const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
3675
250k
  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
3676
250k
  const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
3677
250k
  const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
3678
250k
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3679
250k
  const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
3680
250k
  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
3681
250k
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3682
250k
  const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
3683
250k
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3684
250k
  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3685
250k
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3686
250k
  const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
3687
250k
  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3688
3689
250k
  {
3690
250k
    __m256i u[64], v[64];
3691
3692
    // stage 1
3693
250k
    u[32] = in[1];
3694
250k
    u[34] = in[17];
3695
250k
    u[36] = in[9];
3696
250k
    u[38] = in[25];
3697
250k
    u[40] = in[5];
3698
250k
    u[42] = in[21];
3699
250k
    u[44] = in[13];
3700
250k
    u[46] = in[29];
3701
250k
    u[48] = in[3];
3702
250k
    u[50] = in[19];
3703
250k
    u[52] = in[11];
3704
250k
    u[54] = in[27];
3705
250k
    u[56] = in[7];
3706
250k
    u[58] = in[23];
3707
250k
    u[60] = in[15];
3708
250k
    u[62] = in[31];
3709
3710
250k
    v[16] = in[2];
3711
250k
    v[18] = in[18];
3712
250k
    v[20] = in[10];
3713
250k
    v[22] = in[26];
3714
250k
    v[24] = in[6];
3715
250k
    v[26] = in[22];
3716
250k
    v[28] = in[14];
3717
250k
    v[30] = in[30];
3718
3719
250k
    u[8] = in[4];
3720
250k
    u[10] = in[20];
3721
250k
    u[12] = in[12];
3722
250k
    u[14] = in[28];
3723
3724
250k
    v[4] = in[8];
3725
250k
    v[6] = in[24];
3726
3727
250k
    u[0] = in[0];
3728
250k
    u[2] = in[16];
3729
3730
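    // Note: the scattered in[] indices above are the even/odd decimation of
    // the 64-point IDCT. The odd coefficients seed the length-32 tail
    // u[32..62], the next split seeds v[16..30], then u[8..14], v[4]/v[6],
    // and finally u[0]/u[2]; every stage below then operates on contiguous
    // u[]/v[] ranges.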
    // stage 2
3731
250k
    v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3732
250k
    v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
3733
250k
    v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
3734
250k
    v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
3735
250k
    v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
3736
250k
    v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
3737
250k
    v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
3738
250k
    v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3739
250k
    v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3740
250k
    v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
3741
250k
    v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
3742
250k
    v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
3743
250k
    v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
3744
250k
    v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
3745
250k
    v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
3746
250k
    v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3747
250k
    v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3748
250k
    v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
3749
250k
    v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
3750
250k
    v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
3751
250k
    v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
3752
250k
    v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
3753
250k
    v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
3754
250k
    v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3755
250k
    v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3756
250k
    v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
3757
250k
    v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
3758
250k
    v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
3759
250k
    v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
3760
250k
    v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
3761
250k
    v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
3762
250k
    v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3763
3764
    // stage 3
3765
250k
    u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
3766
250k
    u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
3767
250k
    u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
3768
250k
    u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
3769
250k
    u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
3770
250k
    u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
3771
250k
    u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
3772
250k
    u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
3773
250k
    u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
3774
250k
    u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
3775
250k
    u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
3776
250k
    u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
3777
250k
    u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
3778
250k
    u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
3779
250k
    u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
3780
250k
    u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
3781
3782
2.25M
    for (i = 32; i < 64; i += 4) {
3783
2.00M
      addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3784
2.00M
                  &clamp_hi);
3785
2.00M
      addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3786
2.00M
                  &clamp_hi);
3787
2.00M
    }
3788
3789
    // stage 4
3790
250k
    v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3791
250k
    v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
3792
250k
    v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
3793
250k
    v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
3794
250k
    v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
3795
250k
    v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
3796
250k
    v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
3797
250k
    v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3798
3799
1.25M
    for (i = 16; i < 32; i += 4) {
3800
1.00M
      addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
3801
1.00M
                  &clamp_hi);
3802
1.00M
      addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
3803
1.00M
                  &clamp_hi);
3804
1.00M
    }
3805
3806
2.25M
    for (i = 32; i < 64; i += 4) {
3807
2.00M
      v[i + 0] = u[i + 0];
3808
2.00M
      v[i + 3] = u[i + 3];
3809
2.00M
    }
3810
3811
250k
    v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3812
250k
    v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3813
250k
    v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3814
250k
    v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3815
250k
    v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3816
250k
    v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3817
250k
    v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3818
250k
    v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3819
250k
    v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3820
250k
    v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3821
250k
    v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3822
250k
    v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3823
250k
    v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3824
250k
    v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3825
250k
    v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3826
250k
    v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3827
3828
    // stage 5
3829
250k
    u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
3830
250k
    u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
3831
250k
    u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
3832
250k
    u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
3833
3834
750k
    for (i = 8; i < 16; i += 4) {
3835
500k
      addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3836
500k
                  &clamp_hi);
3837
500k
      addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3838
500k
                  &clamp_hi);
3839
500k
    }
3840
3841
1.25M
    for (i = 16; i < 32; i += 4) {
3842
1.00M
      u[i + 0] = v[i + 0];
3843
1.00M
      u[i + 3] = v[i + 3];
3844
1.00M
    }
3845
3846
250k
    u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
3847
250k
    u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
3848
250k
    u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
3849
250k
    u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
3850
250k
    u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
3851
250k
    u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
3852
250k
    u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
3853
250k
    u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
3854
3855
1.25M
    for (i = 32; i < 64; i += 8) {
3856
1.00M
      addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3857
1.00M
                  &clamp_hi);
3858
1.00M
      addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3859
1.00M
                  &clamp_hi);
3860
3861
1.00M
      addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3862
1.00M
                  &clamp_hi);
3863
1.00M
      addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3864
1.00M
                  &clamp_hi);
3865
1.00M
    }
3866
3867
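    // Note: unlike the in-place low-coefficient variant above, the full
    // idct64_avx2() ping-pongs each stage between u[] and v[], so butterfly
    // inputs survive without tmp registers.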
    // stage 6
3868
250k
    v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3869
250k
    v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3870
250k
    v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
3871
250k
    v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
3872
3873
250k
    addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
3874
250k
    addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
3875
3876
750k
    for (i = 8; i < 16; i += 4) {
3877
500k
      v[i + 0] = u[i + 0];
3878
500k
      v[i + 3] = u[i + 3];
3879
500k
    }
3880
3881
250k
    v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3882
250k
    v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3883
250k
    v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3884
250k
    v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3885
3886
750k
    for (i = 16; i < 32; i += 8) {
3887
500k
      addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
3888
500k
                  &clamp_hi);
3889
500k
      addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
3890
500k
                  &clamp_hi);
3891
3892
500k
      addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
3893
500k
                  &clamp_hi);
3894
500k
      addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
3895
500k
                  &clamp_hi);
3896
500k
    }
3897
3898
1.25M
    for (i = 32; i < 64; i += 8) {
3899
1.00M
      v[i + 0] = u[i + 0];
3900
1.00M
      v[i + 1] = u[i + 1];
3901
1.00M
      v[i + 6] = u[i + 6];
3902
1.00M
      v[i + 7] = u[i + 7];
3903
1.00M
    }
3904
3905
250k
    v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3906
250k
    v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3907
250k
    v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3908
250k
    v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3909
250k
    v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3910
250k
    v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3911
250k
    v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3912
250k
    v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3913
250k
    v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3914
250k
    v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3915
250k
    v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3916
250k
    v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3917
250k
    v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3918
250k
    v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3919
250k
    v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3920
250k
    v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3921
3922
    // stage 7
3923
250k
    addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
3924
250k
    addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
3925
3926
250k
    u[4] = v[4];
3927
250k
    u[7] = v[7];
3928
250k
    u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
3929
250k
    u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
3930
3931
250k
    addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3932
250k
    addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3933
250k
    addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3934
250k
    addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3935
3936
750k
    for (i = 16; i < 32; i += 8) {
3937
500k
      u[i + 0] = v[i + 0];
3938
500k
      u[i + 1] = v[i + 1];
3939
500k
      u[i + 6] = v[i + 6];
3940
500k
      u[i + 7] = v[i + 7];
3941
500k
    }
3942
3943
250k
    u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
3944
250k
    u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
3945
250k
    u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
3946
250k
    u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
3947
250k
    u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
3948
250k
    u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
3949
250k
    u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
3950
250k
    u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
3951
3952
750k
    for (i = 32; i < 64; i += 16) {
3953
2.50M
      for (j = i; j < i + 4; j++) {
3954
2.00M
        addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3955
2.00M
        addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3956
2.00M
                    &clamp_hi);
3957
2.00M
      }
3958
500k
    }
3959
3960
    // stage 8
3961
1.25M
    for (i = 0; i < 4; ++i) {
3962
1.00M
      addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
3963
1.00M
    }
3964
3965
250k
    v[8] = u[8];
3966
250k
    v[9] = u[9];
3967
250k
    v[14] = u[14];
3968
250k
    v[15] = u[15];
3969
3970
250k
    v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
3971
250k
    v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
3972
250k
    v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
3973
250k
    v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
3974
3975
1.25M
    for (i = 16; i < 20; ++i) {
3976
1.00M
      addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
3977
1.00M
      addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
3978
1.00M
                  &clamp_hi);
3979
1.00M
    }
3980
3981
1.25M
    for (i = 32; i < 36; ++i) {
3982
1.00M
      v[i] = u[i];
3983
1.00M
      v[i + 12] = u[i + 12];
3984
1.00M
      v[i + 16] = u[i + 16];
3985
1.00M
      v[i + 28] = u[i + 28];
3986
1.00M
    }
3987
3988
250k
    v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
3989
250k
    v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
3990
250k
    v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
3991
250k
    v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
3992
250k
    v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
3993
250k
    v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
3994
250k
    v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
3995
250k
    v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
3996
250k
    v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
3997
250k
    v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
3998
250k
    v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
3999
250k
    v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
4000
250k
    v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
4001
250k
    v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
4002
250k
    v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
4003
250k
    v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
4004
4005
    // stage 9
4006
2.25M
    for (i = 0; i < 8; ++i) {
4007
2.00M
      addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
4008
2.00M
    }
4009
4010
1.25M
    for (i = 16; i < 20; ++i) {
4011
1.00M
      u[i] = v[i];
4012
1.00M
      u[i + 12] = v[i + 12];
4013
1.00M
    }
4014
4015
250k
    u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
4016
250k
    u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
4017
250k
    u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
4018
250k
    u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
4019
250k
    u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
4020
250k
    u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
4021
250k
    u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
4022
250k
    u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
4023
4024
2.25M
    for (i = 32; i < 40; i++) {
4025
2.00M
      addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
4026
2.00M
    }
4027
4028
2.25M
    for (i = 48; i < 56; i++) {
4029
2.00M
      addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
4030
2.00M
    }
4031
4032
    // stage 10
4033
4.25M
    for (i = 0; i < 16; i++) {
4034
4.00M
      addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
4035
4.00M
    }
4036
4037
2.25M
    for (i = 32; i < 40; i++) v[i] = u[i];
4038
4039
250k
    v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
4040
250k
    v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
4041
250k
    v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
4042
250k
    v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
4043
250k
    v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
4044
250k
    v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
4045
250k
    v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
4046
250k
    v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
4047
250k
    v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
4048
250k
    v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
4049
250k
    v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
4050
250k
    v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
4051
250k
    v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
4052
250k
    v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
4053
250k
    v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
4054
250k
    v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
4055
4056
2.25M
    for (i = 56; i < 64; i++) v[i] = u[i];
4057
4058
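    // Note: stage 11 produces the 64 outputs as v[i] +/- v[63 - i]; for the
    // row pass (!do_cols) they are additionally rounded by out_shift and
    // clamped to the 16-bit-safe intermediate range below.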
    // stage 11
4059
8.25M
    for (i = 0; i < 32; i++) {
4060
8.00M
      addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
4061
8.00M
                  &clamp_hi);
4062
8.00M
    }
4063
250k
    if (!do_cols) {
4064
112k
      const int log_range_out = AOMMAX(16, bd + 6);
4065
112k
      const __m256i clamp_lo_out =
4066
112k
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
4067
112k
      const __m256i clamp_hi_out =
4068
112k
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
4069
4070
112k
      round_shift_8x8_avx2(out, out_shift);
4071
112k
      round_shift_8x8_avx2(out + 16, out_shift);
4072
112k
      round_shift_8x8_avx2(out + 32, out_shift);
4073
112k
      round_shift_8x8_avx2(out + 48, out_shift);
4074
112k
      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
4075
112k
    }
4076
250k
  }
4077
250k
}
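The other primitive used at every stage boundary is addsub_avx2(): a
butterfly add/subtract whose two results are saturated to the signed
intermediate range [clamp_lo, clamp_hi] derived from log_range. A scalar
sketch of one lane, with illustrative names:

#include <stdint.h>

// One lane of addsub_avx2(): sum and difference, each clamped.
static void addsub_scalar(int32_t in0, int32_t in1, int32_t *out0,
                          int32_t *out1, int32_t clamp_lo, int32_t clamp_hi) {
  const int32_t sum = in0 + in1;
  const int32_t diff = in0 - in1;
  *out0 = sum < clamp_lo ? clamp_lo : (sum > clamp_hi ? clamp_hi : sum);
  *out1 = diff < clamp_lo ? clamp_lo : (diff > clamp_hi ? clamp_hi : diff);
}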
4078
typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
4079
                                  int do_cols, int bd, int out_shift);
4080
4081
static const transform_1d_avx2
4082
    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
4083
      {
4084
          { NULL, NULL, NULL, NULL },
4085
          { NULL, NULL, NULL, NULL },
4086
          { NULL, NULL, NULL, NULL },
4087
      },
4088
      {
4089
          { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL },
4090
          { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL },
4091
          { NULL, NULL, NULL, NULL },
4092
      },
4093
      {
4094
          { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
4095
          { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
4096
          { NULL, NULL, NULL, NULL },
4097
      },
4098
      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
4099
        { NULL, NULL, NULL, NULL },
4100
        { NULL, NULL, NULL, NULL } },
4101
4102
      { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 },
4103
        { NULL, NULL, NULL, NULL },
4104
        { NULL, NULL, NULL, NULL } }
4105
    };
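Each row of this table offers up to four kernels per 1-D transform type,
ordered from cheapest to full-length: *_low1 (DC only), *_low8, *_low16 and
the full transform. The dispatcher below derives an index from the eob via
the lowbd_txfm_all_1d_zeros_idx[] table so that only the possibly-nonzero
prefix of coefficients is processed. A hypothetical sketch of the idea (the
thresholds here are made up; the real mapping lives in
lowbd_txfm_all_1d_zeros_idx[]):

// Illustrative only: choose the cheapest kernel whose input width still
// covers every coefficient up to the 1-D end-of-block position.
static int pick_kernel_idx_sketch(int eob_1d) {
  if (eob_1d == 0) return 0;  // DC only -> *_low1 kernel
  if (eob_1d < 8) return 1;   // short prefix -> *_low8 kernel
  if (eob_1d < 16) return 2;  // *_low16 kernel (where it exists)
  return 3;                   // full-length kernel
}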
4106
4107
static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
4108
                                                   uint16_t *output, int stride,
4109
                                                   TX_TYPE tx_type,
4110
                                                   TX_SIZE tx_size, int eob,
4111
4.66M
                                                   const int bd) {
4112
4.66M
  __m256i buf1[64 * 8];
4113
4.66M
  int eobx, eoby;
4114
4.66M
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
4115
4.66M
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
4116
4.66M
  const int txw_idx = get_txw_idx(tx_size);
4117
4.66M
  const int txh_idx = get_txh_idx(tx_size);
4118
4.66M
  const int txfm_size_col = tx_size_wide[tx_size];
4119
4.66M
  const int txfm_size_row = tx_size_high[tx_size];
4120
4.66M
  const int buf_size_w_div8 = txfm_size_col >> 3;
4121
4.66M
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
4122
4.66M
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
4123
4.66M
  const int input_stride = AOMMIN(32, txfm_size_row);
4124
4.66M
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
4125
4.66M
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
4126
4.66M
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
4127
4.66M
  const transform_1d_avx2 row_txfm =
4128
4.66M
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
4129
4.66M
  const transform_1d_avx2 col_txfm =
4130
4.66M
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
4131
4132
4.66M
  assert(col_txfm != NULL);
4133
4.66M
  assert(row_txfm != NULL);
4134
4.66M
  int ud_flip, lr_flip;
4135
4.66M
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4136
4137
  // 1st stage: row transform
4138
9.96M
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
4139
5.30M
    __m256i buf0[64];
4140
5.30M
    load_buffer_32bit_input(input + i * 8, input_stride, buf0,
4141
5.30M
                            buf_size_nonzero_w);
4142
5.30M
    if (rect_type == 1 || rect_type == -1) {
4143
1.26M
      round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0,
4144
1.26M
                                     NewInvSqrt2);
4145
1.26M
    }
4146
5.30M
    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
4147
4148
5.30M
    __m256i *_buf1 = buf1 + i * 8;
4149
5.30M
    if (lr_flip) {
4150
304k
      for (int j = 0; j < buf_size_w_div8; ++j) {
4151
178k
        transpose_8x8_flip_avx2(
4152
178k
            &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
4153
178k
      }
4154
5.17M
    } else {
4155
19.0M
      for (int j = 0; j < buf_size_w_div8; ++j) {
4156
13.9M
        transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
4157
13.9M
      }
4158
5.17M
    }
4159
5.30M
  }
4160
  // 2nd stage: column transform
4161
16.3M
  for (int i = 0; i < buf_size_w_div8; i++) {
4162
11.6M
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
4163
11.6M
             bd, 0);
4164
4165
11.6M
    round_shift_array_32_avx2(buf1 + i * txfm_size_row,
4166
11.6M
                              buf1 + i * txfm_size_row, txfm_size_row,
4167
11.6M
                              -shift[1]);
4168
11.6M
  }
4169
4170
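  // Note: txfm_size_col is at least 8 here, since every transform size with
  // a 4-pel side is routed to the SSE4.1 path by av1_highbd_inv_txfm_add_avx2()
  // below; the two branches that follow are therefore exhaustive.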
  // write to buffer
4171
4.66M
  if (txfm_size_col >= 16) {
4172
7.97M
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
4173
5.00M
      highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
4174
5.00M
                                    output + 16 * i, stride, ud_flip,
4175
5.00M
                                    txfm_size_row, bd);
4176
5.00M
    }
4177
2.97M
  } else if (txfm_size_col == 8) {
4178
1.68M
    highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row,
4179
1.68M
                                 bd);
4180
1.68M
  }
4181
4.66M
}
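The highbd_write_buffer_8xn_avx2()/highbd_write_buffer_16xn_avx2() helpers
used above add the inverse-transformed residual to the 16-bit predictor and
clamp to the valid pixel range for the bit depth. A minimal scalar sketch of
that reconstruction step (illustrative name, not the library code):

#include <stdint.h>

// Residual + predictor, clamped to [0, (1 << bd) - 1].
static uint16_t reconstruct_pixel_sketch(uint16_t pred, int32_t residual,
                                         int bd) {
  const int32_t max_pixel = (1 << bd) - 1;
  const int32_t v = (int32_t)pred + residual;
  return (uint16_t)(v < 0 ? 0 : (v > max_pixel ? max_pixel : v));
}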
4182
4183
static void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
4184
                                                    uint8_t *output, int stride,
4185
                                                    TX_TYPE tx_type,
4186
                                                    TX_SIZE tx_size, int eob,
4187
5.28M
                                                    const int bd) {
4188
5.28M
  switch (tx_type) {
4189
2.84M
    case DCT_DCT:
4190
3.28M
    case ADST_DCT:
4191
3.95M
    case DCT_ADST:
4192
4.46M
    case ADST_ADST:
4193
4.51M
    case FLIPADST_DCT:
4194
4.57M
    case DCT_FLIPADST:
4195
4.59M
    case FLIPADST_FLIPADST:
4196
4.62M
    case ADST_FLIPADST:
4197
4.66M
    case FLIPADST_ADST:
4198
4.66M
      highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
4199
4.66M
                                             stride, tx_type, tx_size, eob, bd);
4200
4.66M
      break;
4201
334k
    case IDTX:
4202
467k
    case H_DCT:
4203
491k
    case H_ADST:
4204
499k
    case H_FLIPADST:
4205
540k
    case V_DCT:
4206
619k
    case V_ADST:
4207
622k
    case V_FLIPADST:
4208
622k
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
4209
622k
                                                tx_size, eob, bd);
4210
622k
      break;
4211
0
    default: assert(0); break;
4212
5.28M
  }
4213
5.28M
}
4214
void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
4215
10.9M
                                  int stride, const TxfmParam *txfm_param) {
4216
10.9M
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
4217
10.9M
  const TX_SIZE tx_size = txfm_param->tx_size;
4218
10.9M
  switch (tx_size) {
4219
661k
    case TX_4X8:
4220
1.58M
    case TX_8X4:
4221
4.57M
    case TX_4X4:
4222
5.23M
    case TX_16X4:
4223
5.67M
    case TX_4X16:
4224
5.67M
      av1_highbd_inv_txfm_add_sse4_1(input, dest, stride, txfm_param);
4225
5.67M
      break;
4226
5.28M
    default:
4227
5.28M
      av1_highbd_inv_txfm2d_add_universe_avx2(
4228
5.28M
          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
4229
5.28M
          txfm_param->eob, txfm_param->bd);
4230
5.28M
      break;
4231
10.9M
  }
4232
10.9M
}
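A hedged usage sketch of this entry point: the TxfmParam fields shown are the
ones read here, dqcoeff/dst16/stride/eob are assumed caller state, and
CONVERT_TO_BYTEPTR is aom's wrapper for passing a 16-bit buffer through a
uint8_t* interface. tx_set_type must also be set consistently with tx_type to
satisfy the assert above.

// Hypothetical caller (assumes dqcoeff, dst16, stride and eob exist):
TxfmParam param = { 0 };
param.tx_type = DCT_DCT;   // identity-containing types fall back to SSE4.1
param.tx_size = TX_32X32;  // sizes with a 4-pel side are routed to SSE4.1
param.eob = eob;           // end-of-block from entropy decoding
param.bd = 10;             // bit depth: 8, 10 or 12
av1_highbd_inv_txfm_add_avx2(dqcoeff, CONVERT_TO_BYTEPTR(dst16), stride,
                             &param);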