Coverage Report

Created: 2024-06-18 06:48

/src/aom/av1/common/x86/av1_inv_txfm_avx2.c
Line| Count|Source
   1|      |/*
   2|      | * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
   3|      | *
   4|      | * This source code is subject to the terms of the BSD 2 Clause License and
   5|      | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
   6|      | * was not distributed with this source code in the LICENSE file, you can
   7|      | * obtain it at www.aomedia.org/license/software. If the Alliance for Open
   8|      | * Media Patent License 1.0 was not distributed with this source code in the
   9|      | * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  10|      | */
  11|      |
  12|      |#include "config/aom_config.h"
  13|      |
  14|      |#include "config/av1_rtcd.h"
  15|      |
  16|      |#include "av1/common/av1_inv_txfm1d_cfg.h"
  17|      |#include "av1/common/x86/av1_txfm_sse2.h"
  18|      |#include "av1/common/x86/av1_inv_txfm_avx2.h"
  19|      |#include "av1/common/x86/av1_inv_txfm_ssse3.h"
  20|      |
  21|      |// TODO(venkatsanampudi@ittiam.com): move this to header file
  22|      |
  23|      |// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
  24|      |static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
  25|      |                                          4 * 5793 };
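
The table above holds Q12 fixed-point powers of sqrt(2): 4096 = 2^12 represents 1.0, and 5793 is the rounded representation of sqrt(2). A quick check (note the odd powers reuse the already-rounded base constant, so 2 * 5793 = 11586 rather than re-rounding 2^12 * (sqrt(2))^3, which would give 11585):

    \text{NewSqrt2} = \left\lfloor 2^{12}\sqrt{2} + \tfrac{1}{2} \right\rfloor = \lfloor 5792.62 + 0.5 \rfloor = 5793,
    \qquad 2^{12}\,(\sqrt{2})^{2} = 2 \cdot 4096 = 8192
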
  26|      |
  27|      |static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
  28| 1.40M|                                      const __m256i _r, int8_t cos_bit) {
  29| 1.40M|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  30| 1.40M|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  31| 1.40M|  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  32| 1.40M|  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  33| 1.40M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
  34|      |
  35| 1.40M|  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
  36| 1.40M|  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
  37| 1.40M|  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
  38| 1.40M|  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
  39| 1.40M|}
  40|      |
  41|      |static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
  42| 1.40M|                                      const __m256i _r, int8_t cos_bit) {
  43| 1.40M|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  44| 1.40M|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  45| 1.40M|  btf_16_adds_subs_avx2(&x[0], &x[7]);
  46| 1.40M|  btf_16_adds_subs_avx2(&x[1], &x[6]);
  47| 1.40M|  btf_16_adds_subs_avx2(&x[2], &x[5]);
  48| 1.40M|  btf_16_adds_subs_avx2(&x[3], &x[4]);
  49| 1.40M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  50| 1.40M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  51| 1.40M|}
  52|      |
  53| 1.40M|static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
  54| 1.40M|  btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
  55| 1.40M|  btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
  56| 1.40M|  btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
  57| 1.40M|  btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
  58| 1.40M|  btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
  59| 1.40M|  btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
  60| 1.40M|  btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
  61| 1.40M|  btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
  62| 1.40M|}
  63|      |
  64|  459k|static void idct16_avx2(const __m256i *input, __m256i *output) {
  65|  459k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  66|  459k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  67|      |
  68|  459k|  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  69|  459k|  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  70|  459k|  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  71|  459k|  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  72|  459k|  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  73|  459k|  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  74|  459k|  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  75|  459k|  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  76|  459k|  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  77|  459k|  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  78|  459k|  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  79|  459k|  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  80|  459k|  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  81|  459k|  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  82|  459k|  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  83|  459k|  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  84|  459k|  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  85|  459k|  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  86|  459k|  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  87|      |
  88|      |  // stage 1
  89|  459k|  __m256i x1[16];
  90|  459k|  x1[0] = input[0];
  91|  459k|  x1[1] = input[8];
  92|  459k|  x1[2] = input[4];
  93|  459k|  x1[3] = input[12];
  94|  459k|  x1[4] = input[2];
  95|  459k|  x1[5] = input[10];
  96|  459k|  x1[6] = input[6];
  97|  459k|  x1[7] = input[14];
  98|  459k|  x1[8] = input[1];
  99|  459k|  x1[9] = input[9];
 100|  459k|  x1[10] = input[5];
 101|  459k|  x1[11] = input[13];
 102|  459k|  x1[12] = input[3];
 103|  459k|  x1[13] = input[11];
 104|  459k|  x1[14] = input[7];
 105|  459k|  x1[15] = input[15];
 106|      |
 107|      |  // stage 2
 108|  459k|  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
 109|  459k|                  INV_COS_BIT);
 110|  459k|  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
 111|  459k|                  INV_COS_BIT);
 112|  459k|  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
 113|  459k|                  INV_COS_BIT);
 114|  459k|  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
 115|  459k|                  INV_COS_BIT);
 116|      |
 117|      |  // stage 3
 118|  459k|  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
 119|  459k|                  INV_COS_BIT);
 120|  459k|  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
 121|  459k|                  INV_COS_BIT);
 122|  459k|  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
 123|  459k|  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
 124|  459k|  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
 125|  459k|  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
 126|      |
 127|      |  // stage 4
 128|  459k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
 129|  459k|                  INV_COS_BIT);
 130|  459k|  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
 131|  459k|                  INV_COS_BIT);
 132|  459k|  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
 133|  459k|  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
 134|  459k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
 135|  459k|                  INV_COS_BIT);
 136|  459k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
 137|  459k|                  INV_COS_BIT);
 138|      |
 139|  459k|  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
 140|  459k|  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
 141|  459k|  idct16_stage7_avx2(output, x1);
 142|  459k|}
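
Throughout this file, btf_16_w16_avx2 is the workhorse "butterfly": it rotates a pair of coefficient vectors by an angle whose Q12 cosine/sine weights are packed by pair_set_w16_epi16, rounding with _r and shifting right by cos_bit; btf_16_adds_subs_avx2 is the multiply-free degenerate case (saturating sum and difference of the pair). A minimal scalar model of what each of the 16 lanes computes, assuming weight pairs w0 = (w0a, w0b) and w1 = (w1a, w1b); this is an illustrative sketch, not the libaom macro itself:

    #include <stdint.h>

    /* Scalar model of one butterfly step per 16-bit lane. */
    static void btf16_scalar_model(int32_t w0a, int32_t w0b, int32_t w1a,
                                   int32_t w1b, int16_t *a, int16_t *b,
                                   int8_t cos_bit) {
      const int32_t r = 1 << (cos_bit - 1);                    /* matches _r */
      const int32_t t0 = (w0a * *a + w0b * *b + r) >> cos_bit; /* rotated out 0 */
      const int32_t t1 = (w1a * *a + w1b * *b + r) >> cos_bit; /* rotated out 1 */
      *a = (int16_t)t0;  /* the AVX2 path packs with saturation instead */
      *b = (int16_t)t1;
    }
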
 143|      |
 144|  944k|static void idct16_low8_avx2(const __m256i *input, __m256i *output) {
 145|  944k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
 146|  944k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 147|      |
 148|  944k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 149|  944k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 150|  944k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
 151|      |
 152|      |  // stage 1
 153|  944k|  __m256i x1[16];
 154|  944k|  x1[0] = input[0];
 155|  944k|  x1[2] = input[4];
 156|  944k|  x1[4] = input[2];
 157|  944k|  x1[6] = input[6];
 158|  944k|  x1[8] = input[1];
 159|  944k|  x1[10] = input[5];
 160|  944k|  x1[12] = input[3];
 161|  944k|  x1[14] = input[7];
 162|      |
 163|      |  // stage 2
 164|  944k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
 165|  944k|  btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
 166|  944k|  btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
 167|  944k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);
 168|      |
 169|      |  // stage 3
 170|  944k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
 171|  944k|  btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
 172|  944k|  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
 173|  944k|  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
 174|  944k|  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
 175|  944k|  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
 176|      |
 177|      |  // stage 4
 178|  944k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
 179|  944k|  btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
 180|  944k|  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
 181|  944k|  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
 182|  944k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
 183|  944k|                  INV_COS_BIT);
 184|  944k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
 185|  944k|                  INV_COS_BIT);
 186|      |
 187|  944k|  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
 188|  944k|  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
 189|  944k|  idct16_stage7_avx2(output, x1);
 190|  944k|}
 191|      |
 192|  640k|static void idct16_low1_avx2(const __m256i *input, __m256i *output) {
 193|  640k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
 194|      |
 195|      |  // stage 1
 196|  640k|  __m256i x1[2];
 197|  640k|  x1[0] = input[0];
 198|      |
 199|      |  // stage 2
 200|      |  // stage 3
 201|      |  // stage 4
 202|  640k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
 203|      |
 204|      |  // stage 5
 205|      |  // stage 6
 206|  640k|  output[0] = x1[0];
 207|  640k|  output[1] = x1[1];
 208|  640k|  output[2] = x1[1];
 209|  640k|  output[3] = x1[0];
 210|  640k|  output[4] = x1[0];
 211|  640k|  output[5] = x1[1];
 212|  640k|  output[6] = x1[1];
 213|  640k|  output[7] = x1[0];
 214|  640k|  output[8] = x1[0];
 215|  640k|  output[9] = x1[1];
 216|  640k|  output[10] = x1[1];
 217|  640k|  output[11] = x1[0];
 218|  640k|  output[12] = x1[0];
 219|  640k|  output[13] = x1[1];
 220|  640k|  output[14] = x1[1];
 221|  640k|  output[15] = x1[0];
 222|  640k|}
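
In the low1 variant only the DC coefficient survives quantization, so the whole 16-point transform collapses to a single scaling: btf_16_w16_0_avx2 with the pair (cospi[32], cospi[32]) writes the same value, dc scaled by cos(pi/4) in Q12, into both x1[0] and x1[1], and every output row receives that value. A scalar sketch under the assumption INV_COS_BIT == 12 (idct16_dc_scalar is a hypothetical name for illustration):

    #include <stdint.h>

    static int16_t idct16_dc_scalar(int16_t dc) {
      const int32_t cospi32 = 2896;               /* round(2^12 * cos(pi/4)) */
      const int32_t r = 1 << 11;                  /* rounding offset */
      return (int16_t)((dc * cospi32 + r) >> 12); /* replicated to all rows */
    }
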
 223|      |
 224|  458k|static INLINE void iadst16_stage3_avx2(__m256i *x) {
 225|  458k|  btf_16_adds_subs_avx2(&x[0], &x[8]);
 226|  458k|  btf_16_adds_subs_avx2(&x[1], &x[9]);
 227|  458k|  btf_16_adds_subs_avx2(&x[2], &x[10]);
 228|  458k|  btf_16_adds_subs_avx2(&x[3], &x[11]);
 229|  458k|  btf_16_adds_subs_avx2(&x[4], &x[12]);
 230|  458k|  btf_16_adds_subs_avx2(&x[5], &x[13]);
 231|  458k|  btf_16_adds_subs_avx2(&x[6], &x[14]);
 232|  458k|  btf_16_adds_subs_avx2(&x[7], &x[15]);
 233|  458k|}
 234|      |
 235|      |static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
 236|  458k|                                       const __m256i _r, int8_t cos_bit) {
 237|  458k|  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
 238|  458k|  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
 239|  458k|  const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
 240|  458k|  const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
 241|  458k|  const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
 242|  458k|  const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
 243|  458k|  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
 244|  458k|  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
 245|  458k|  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
 246|  458k|  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
 247|  458k|}
 248|      |
 249|  458k|static INLINE void iadst16_stage5_avx2(__m256i *x) {
 250|  458k|  btf_16_adds_subs_avx2(&x[0], &x[4]);
 251|  458k|  btf_16_adds_subs_avx2(&x[1], &x[5]);
 252|  458k|  btf_16_adds_subs_avx2(&x[2], &x[6]);
 253|  458k|  btf_16_adds_subs_avx2(&x[3], &x[7]);
 254|  458k|  btf_16_adds_subs_avx2(&x[8], &x[12]);
 255|  458k|  btf_16_adds_subs_avx2(&x[9], &x[13]);
 256|  458k|  btf_16_adds_subs_avx2(&x[10], &x[14]);
 257|  458k|  btf_16_adds_subs_avx2(&x[11], &x[15]);
 258|  458k|}
 259|      |
 260|      |static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
 261|  458k|                                       const __m256i _r, int8_t cos_bit) {
 262|  458k|  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
 263|  458k|  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
 264|  458k|  const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
 265|  458k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
 266|  458k|  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
 267|  458k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
 268|  458k|  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
 269|  458k|}
 270|      |
 271|  458k|static INLINE void iadst16_stage7_avx2(__m256i *x) {
 272|  458k|  btf_16_adds_subs_avx2(&x[0], &x[2]);
 273|  458k|  btf_16_adds_subs_avx2(&x[1], &x[3]);
 274|  458k|  btf_16_adds_subs_avx2(&x[4], &x[6]);
 275|  458k|  btf_16_adds_subs_avx2(&x[5], &x[7]);
 276|  458k|  btf_16_adds_subs_avx2(&x[8], &x[10]);
 277|  458k|  btf_16_adds_subs_avx2(&x[9], &x[11]);
 278|  458k|  btf_16_adds_subs_avx2(&x[12], &x[14]);
 279|  458k|  btf_16_adds_subs_avx2(&x[13], &x[15]);
 280|  458k|}
 281|      |
 282|      |static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
 283|  577k|                                       const __m256i _r, int8_t cos_bit) {
 284|  577k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 285|  577k|  const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
 286|  577k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
 287|  577k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
 288|  577k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
 289|  577k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
 290|  577k|}
 291|      |
 292|  577k|static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
 293|  577k|  const __m256i __zero = _mm256_setzero_si256();
 294|  577k|  output[0] = x1[0];
 295|  577k|  output[1] = _mm256_subs_epi16(__zero, x1[8]);
 296|  577k|  output[2] = x1[12];
 297|  577k|  output[3] = _mm256_subs_epi16(__zero, x1[4]);
 298|  577k|  output[4] = x1[6];
 299|  577k|  output[5] = _mm256_subs_epi16(__zero, x1[14]);
 300|  577k|  output[6] = x1[10];
 301|  577k|  output[7] = _mm256_subs_epi16(__zero, x1[2]);
 302|  577k|  output[8] = x1[3];
 303|  577k|  output[9] = _mm256_subs_epi16(__zero, x1[11]);
 304|  577k|  output[10] = x1[15];
 305|  577k|  output[11] = _mm256_subs_epi16(__zero, x1[7]);
 306|  577k|  output[12] = x1[5];
 307|  577k|  output[13] = _mm256_subs_epi16(__zero, x1[13]);
 308|  577k|  output[14] = x1[9];
 309|  577k|  output[15] = _mm256_subs_epi16(__zero, x1[1]);
 310|  577k|}
 311|      |
 312|  128k|static void iadst16_avx2(const __m256i *input, __m256i *output) {
 313|  128k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
 314|      |
 315|  128k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 316|      |
 317|  128k|  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
 318|  128k|  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
 319|  128k|  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
 320|  128k|  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
 321|  128k|  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
 322|  128k|  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
 323|  128k|  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
 324|  128k|  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
 325|  128k|  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
 326|  128k|  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
 327|  128k|  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
 328|  128k|  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
 329|  128k|  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
 330|  128k|  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
 331|  128k|  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
 332|  128k|  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
 333|      |
 334|      |  // stage 1
 335|  128k|  __m256i x1[16];
 336|  128k|  x1[0] = input[15];
 337|  128k|  x1[1] = input[0];
 338|  128k|  x1[2] = input[13];
 339|  128k|  x1[3] = input[2];
 340|  128k|  x1[4] = input[11];
 341|  128k|  x1[5] = input[4];
 342|  128k|  x1[6] = input[9];
 343|  128k|  x1[7] = input[6];
 344|  128k|  x1[8] = input[7];
 345|  128k|  x1[9] = input[8];
 346|  128k|  x1[10] = input[5];
 347|  128k|  x1[11] = input[10];
 348|  128k|  x1[12] = input[3];
 349|  128k|  x1[13] = input[12];
 350|  128k|  x1[14] = input[1];
 351|  128k|  x1[15] = input[14];
 352|      |
 353|      |  // stage 2
 354|  128k|  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r,
 355|  128k|                  INV_COS_BIT);
 356|  128k|  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r,
 357|  128k|                  INV_COS_BIT);
 358|  128k|  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r,
 359|  128k|                  INV_COS_BIT);
 360|  128k|  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r,
 361|  128k|                  INV_COS_BIT);
 362|  128k|  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r,
 363|  128k|                  INV_COS_BIT);
 364|  128k|  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r,
 365|  128k|                  INV_COS_BIT);
 366|  128k|  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r,
 367|  128k|                  INV_COS_BIT);
 368|  128k|  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r,
 369|  128k|                  INV_COS_BIT);
 370|      |
 371|  128k|  iadst16_stage3_avx2(x1);
 372|  128k|  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
 373|  128k|  iadst16_stage5_avx2(x1);
 374|  128k|  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
 375|  128k|  iadst16_stage7_avx2(x1);
 376|  128k|  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
 377|  128k|  iadst16_stage9_avx2(output, x1);
 378|  128k|}
 379|      |
 380|  330k|static void iadst16_low8_avx2(const __m256i *input, __m256i *output) {
 381|  330k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
 382|  330k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 383|      |
 384|      |  // stage 1
 385|  330k|  __m256i x1[16];
 386|  330k|  x1[1] = input[0];
 387|  330k|  x1[3] = input[2];
 388|  330k|  x1[5] = input[4];
 389|  330k|  x1[7] = input[6];
 390|  330k|  x1[8] = input[7];
 391|  330k|  x1[10] = input[5];
 392|  330k|  x1[12] = input[3];
 393|  330k|  x1[14] = input[1];
 394|      |
 395|      |  // stage 2
 396|  330k|  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
 397|  330k|  btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
 398|  330k|  btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
 399|  330k|  btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
 400|  330k|  btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
 401|  330k|  btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
 402|  330k|  btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
 403|  330k|  btf_16_w16_0_avx2(cospi[58], cospi[6], x1[14], x1[14], x1[15]);
 404|      |
 405|  330k|  iadst16_stage3_avx2(x1);
 406|  330k|  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
 407|  330k|  iadst16_stage5_avx2(x1);
 408|  330k|  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
 409|  330k|  iadst16_stage7_avx2(x1);
 410|  330k|  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
 411|  330k|  iadst16_stage9_avx2(output, x1);
 412|  330k|}
 413|      |
 414|  118k|static void iadst16_low1_avx2(const __m256i *input, __m256i *output) {
 415|  118k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
 416|  118k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 417|      |
 418|  118k|  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
 419|  118k|  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
 420|  118k|  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
 421|  118k|  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
 422|      |
 423|      |  // stage 1
 424|  118k|  __m256i x1[16];
 425|  118k|  x1[1] = input[0];
 426|      |
 427|      |  // stage 2
 428|  118k|  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
 429|      |
 430|      |  // stage 3
 431|  118k|  x1[8] = x1[0];
 432|  118k|  x1[9] = x1[1];
 433|      |
 434|      |  // stage 4
 435|  118k|  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r,
 436|  118k|                  INV_COS_BIT);
 437|      |
 438|      |  // stage 5
 439|  118k|  x1[4] = x1[0];
 440|  118k|  x1[5] = x1[1];
 441|      |
 442|  118k|  x1[12] = x1[8];
 443|  118k|  x1[13] = x1[9];
 444|      |
 445|      |  // stage 6
 446|  118k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r,
 447|  118k|                  INV_COS_BIT);
 448|  118k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r,
 449|  118k|                  INV_COS_BIT);
 450|      |
 451|      |  // stage 7
 452|  118k|  x1[2] = x1[0];
 453|  118k|  x1[3] = x1[1];
 454|  118k|  x1[6] = x1[4];
 455|  118k|  x1[7] = x1[5];
 456|  118k|  x1[10] = x1[8];
 457|  118k|  x1[11] = x1[9];
 458|  118k|  x1[14] = x1[12];
 459|  118k|  x1[15] = x1[13];
 460|      |
 461|  118k|  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
 462|  118k|  iadst16_stage9_avx2(output, x1);
 463|  118k|}
 464|      |
 465|  389k|static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
 466|  389k|  btf_16_adds_subs_avx2(&x[16], &x[17]);
 467|  389k|  btf_16_adds_subs_avx2(&x[19], &x[18]);
 468|  389k|  btf_16_adds_subs_avx2(&x[20], &x[21]);
 469|  389k|  btf_16_adds_subs_avx2(&x[23], &x[22]);
 470|  389k|  btf_16_adds_subs_avx2(&x[24], &x[25]);
 471|  389k|  btf_16_adds_subs_avx2(&x[27], &x[26]);
 472|  389k|  btf_16_adds_subs_avx2(&x[28], &x[29]);
 473|  389k|  btf_16_adds_subs_avx2(&x[31], &x[30]);
 474|  389k|}
 475|      |
 476|      |static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
 477| 1.10M|                                             const __m256i _r, int8_t cos_bit) {
 478| 1.10M|  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
 479| 1.10M|  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
 480| 1.10M|  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
 481| 1.10M|  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
 482| 1.10M|  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
 483| 1.10M|  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
 484| 1.10M|  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
 485| 1.10M|  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
 486| 1.10M|  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
 487| 1.10M|  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
 488| 1.10M|}
 489|      |
 490|      |static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
 491| 1.10M|                                             const __m256i _r, int8_t cos_bit) {
 492| 1.10M|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 493| 1.10M|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 494| 1.10M|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
 495| 1.10M|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
 496| 1.10M|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
 497| 1.10M|  btf_16_adds_subs_avx2(&x[16], &x[19]);
 498| 1.10M|  btf_16_adds_subs_avx2(&x[17], &x[18]);
 499| 1.10M|  btf_16_adds_subs_avx2(&x[23], &x[20]);
 500| 1.10M|  btf_16_adds_subs_avx2(&x[22], &x[21]);
 501| 1.10M|  btf_16_adds_subs_avx2(&x[24], &x[27]);
 502| 1.10M|  btf_16_adds_subs_avx2(&x[25], &x[26]);
 503| 1.10M|  btf_16_adds_subs_avx2(&x[31], &x[28]);
 504| 1.10M|  btf_16_adds_subs_avx2(&x[30], &x[29]);
 505| 1.10M|}
 506|      |
 507|      |static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
 508| 1.10M|                                             const __m256i _r, int8_t cos_bit) {
 509| 1.10M|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 510| 1.10M|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 511| 1.10M|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 512| 1.10M|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 513| 1.10M|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
 514| 1.10M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
 515| 1.10M|  btf_16_adds_subs_avx2(&x[8], &x[11]);
 516| 1.10M|  btf_16_adds_subs_avx2(&x[9], &x[10]);
 517| 1.10M|  btf_16_adds_subs_avx2(&x[15], &x[12]);
 518| 1.10M|  btf_16_adds_subs_avx2(&x[14], &x[13]);
 519| 1.10M|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
 520| 1.10M|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
 521| 1.10M|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
 522| 1.10M|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
 523| 1.10M|}
 524|      |
 525|      |static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
 526| 1.10M|                                      const __m256i _r, int8_t cos_bit) {
 527| 1.10M|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 528| 1.10M|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 529| 1.10M|  btf_16_adds_subs_avx2(&x[0], &x[7]);
 530| 1.10M|  btf_16_adds_subs_avx2(&x[1], &x[6]);
 531| 1.10M|  btf_16_adds_subs_avx2(&x[2], &x[5]);
 532| 1.10M|  btf_16_adds_subs_avx2(&x[3], &x[4]);
 533| 1.10M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
 534| 1.10M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
 535| 1.10M|  btf_16_adds_subs_avx2(&x[16], &x[23]);
 536| 1.10M|  btf_16_adds_subs_avx2(&x[17], &x[22]);
 537| 1.10M|  btf_16_adds_subs_avx2(&x[18], &x[21]);
 538| 1.10M|  btf_16_adds_subs_avx2(&x[19], &x[20]);
 539| 1.10M|  btf_16_adds_subs_avx2(&x[31], &x[24]);
 540| 1.10M|  btf_16_adds_subs_avx2(&x[30], &x[25]);
 541| 1.10M|  btf_16_adds_subs_avx2(&x[29], &x[26]);
 542| 1.10M|  btf_16_adds_subs_avx2(&x[28], &x[27]);
 543| 1.10M|}
 544|      |
 545|      |static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
 546| 1.10M|                                      const __m256i _r, int8_t cos_bit) {
 547| 1.10M|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 548| 1.10M|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 549| 1.10M|  btf_16_adds_subs_avx2(&x[0], &x[15]);
 550| 1.10M|  btf_16_adds_subs_avx2(&x[1], &x[14]);
 551| 1.10M|  btf_16_adds_subs_avx2(&x[2], &x[13]);
 552| 1.10M|  btf_16_adds_subs_avx2(&x[3], &x[12]);
 553| 1.10M|  btf_16_adds_subs_avx2(&x[4], &x[11]);
 554| 1.10M|  btf_16_adds_subs_avx2(&x[5], &x[10]);
 555| 1.10M|  btf_16_adds_subs_avx2(&x[6], &x[9]);
 556| 1.10M|  btf_16_adds_subs_avx2(&x[7], &x[8]);
 557| 1.10M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
 558| 1.10M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
 559| 1.10M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
 560| 1.10M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
 561| 1.10M|}
 562|      |
 563| 1.10M|static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
 564| 1.10M|  btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
 565| 1.10M|  btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
 566| 1.10M|  btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
 567| 1.10M|  btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
 568| 1.10M|  btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
 569| 1.10M|  btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
 570| 1.10M|  btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
 571| 1.10M|  btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
 572| 1.10M|  btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
 573| 1.10M|  btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
 574| 1.10M|  btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
 575| 1.10M|  btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
 576| 1.10M|  btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
 577| 1.10M|  btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
 578| 1.10M|  btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
 579| 1.10M|  btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
 580| 1.10M|}
 581|      |
 582|  544k|static void idct32_low1_avx2(const __m256i *input, __m256i *output) {
 583|  544k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
 584|      |
 585|      |  // stage 1
 586|  544k|  __m256i x[2];
 587|  544k|  x[0] = input[0];
 588|      |
 589|      |  // stage 2
 590|      |  // stage 3
 591|      |  // stage 4
 592|      |  // stage 5
 593|  544k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
 594|      |
 595|      |  // stage 6
 596|      |  // stage 7
 597|      |  // stage 8
 598|      |  // stage 9
 599|  544k|  output[0] = x[0];
 600|  544k|  output[31] = x[0];
 601|  544k|  output[1] = x[1];
 602|  544k|  output[30] = x[1];
 603|  544k|  output[2] = x[1];
 604|  544k|  output[29] = x[1];
 605|  544k|  output[3] = x[0];
 606|  544k|  output[28] = x[0];
 607|  544k|  output[4] = x[0];
 608|  544k|  output[27] = x[0];
 609|  544k|  output[5] = x[1];
 610|  544k|  output[26] = x[1];
 611|  544k|  output[6] = x[1];
 612|  544k|  output[25] = x[1];
 613|  544k|  output[7] = x[0];
 614|  544k|  output[24] = x[0];
 615|  544k|  output[8] = x[0];
 616|  544k|  output[23] = x[0];
 617|  544k|  output[9] = x[1];
 618|  544k|  output[22] = x[1];
 619|  544k|  output[10] = x[1];
 620|  544k|  output[21] = x[1];
 621|  544k|  output[11] = x[0];
 622|  544k|  output[20] = x[0];
 623|  544k|  output[12] = x[0];
 624|  544k|  output[19] = x[0];
 625|  544k|  output[13] = x[1];
 626|  544k|  output[18] = x[1];
 627|  544k|  output[14] = x[1];
 628|  544k|  output[17] = x[1];
 629|  544k|  output[15] = x[0];
 630|  544k|  output[16] = x[0];
 631|  544k|}
 632|      |
 633|  720k|static void idct32_low8_avx2(const __m256i *input, __m256i *output) {
 634|  720k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
 635|  720k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 636|      |
 637|      |  // stage 1
 638|  720k|  __m256i x[32];
 639|  720k|  x[0] = input[0];
 640|  720k|  x[4] = input[4];
 641|  720k|  x[8] = input[2];
 642|  720k|  x[12] = input[6];
 643|  720k|  x[16] = input[1];
 644|  720k|  x[20] = input[5];
 645|  720k|  x[24] = input[3];
 646|  720k|  x[28] = input[7];
 647|      |
 648|      |  // stage 2
 649|  720k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
 650|  720k|  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
 651|  720k|  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
 652|  720k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
 653|      |
 654|      |  // stage 3
 655|  720k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
 656|  720k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
 657|  720k|  x[17] = x[16];
 658|  720k|  x[18] = x[19];
 659|  720k|  x[21] = x[20];
 660|  720k|  x[22] = x[23];
 661|  720k|  x[25] = x[24];
 662|  720k|  x[26] = x[27];
 663|  720k|  x[29] = x[28];
 664|  720k|  x[30] = x[31];
 665|      |
 666|      |  // stage 4
 667|  720k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
 668|  720k|  x[9] = x[8];
 669|  720k|  x[10] = x[11];
 670|  720k|  x[13] = x[12];
 671|  720k|  x[14] = x[15];
 672|  720k|  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
 673|      |
 674|      |  // stage 5
 675|  720k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
 676|  720k|  x[5] = x[4];
 677|  720k|  x[6] = x[7];
 678|  720k|  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
 679|      |  // stage 6
 680|  720k|  x[3] = x[0];
 681|  720k|  x[2] = x[1];
 682|  720k|  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
 683|      |
 684|  720k|  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
 685|  720k|  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
 686|  720k|  idct32_stage9_avx2(output, x);
 687|  720k|}
 688|      |
 689|  214k|static void idct32_low16_avx2(const __m256i *input, __m256i *output) {
 690|  214k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
 691|  214k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 692|      |
 693|      |  // stage 1
 694|  214k|  __m256i x[32];
 695|  214k|  x[0] = input[0];
 696|  214k|  x[2] = input[8];
 697|  214k|  x[4] = input[4];
 698|  214k|  x[6] = input[12];
 699|  214k|  x[8] = input[2];
 700|  214k|  x[10] = input[10];
 701|  214k|  x[12] = input[6];
 702|  214k|  x[14] = input[14];
 703|  214k|  x[16] = input[1];
 704|  214k|  x[18] = input[9];
 705|  214k|  x[20] = input[5];
 706|  214k|  x[22] = input[13];
 707|  214k|  x[24] = input[3];
 708|  214k|  x[26] = input[11];
 709|  214k|  x[28] = input[7];
 710|  214k|  x[30] = input[15];
 711|      |
 712|      |  // stage 2
 713|  214k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
 714|  214k|  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
 715|  214k|  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
 716|  214k|  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
 717|  214k|  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
 718|  214k|  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
 719|  214k|  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
 720|  214k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
 721|      |
 722|      |  // stage 3
 723|  214k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
 724|  214k|  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
 725|  214k|  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
 726|  214k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
 727|  214k|  idct32_high16_stage3_avx2(x);
 728|      |
 729|      |  // stage 4
 730|  214k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
 731|  214k|  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
 732|  214k|  btf_16_adds_subs_avx2(&x[8], &x[9]);
 733|  214k|  btf_16_adds_subs_avx2(&x[11], &x[10]);
 734|  214k|  btf_16_adds_subs_avx2(&x[12], &x[13]);
 735|  214k|  btf_16_adds_subs_avx2(&x[15], &x[14]);
 736|  214k|  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
 737|      |
 738|      |  // stage 5
 739|  214k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
 740|  214k|  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
 741|  214k|  btf_16_adds_subs_avx2(&x[4], &x[5]);
 742|  214k|  btf_16_adds_subs_avx2(&x[7], &x[6]);
 743|  214k|  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
 744|      |
 745|  214k|  btf_16_adds_subs_avx2(&x[0], &x[3]);
 746|  214k|  btf_16_adds_subs_avx2(&x[1], &x[2]);
 747|  214k|  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
 748|      |
 749|  214k|  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
 750|  214k|  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
 751|  214k|  idct32_stage9_avx2(output, x);
 752|  214k|}
 753|      |
 754|  175k|static void idct32_avx2(const __m256i *input, __m256i *output) {
 755|  175k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
 756|  175k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 757|      |
 758|  175k|  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
 759|  175k|  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
 760|  175k|  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
 761|  175k|  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
 762|  175k|  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
 763|  175k|  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
 764|  175k|  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
 765|  175k|  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
 766|  175k|  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
 767|  175k|  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
 768|  175k|  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
 769|  175k|  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
 770|  175k|  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
 771|  175k|  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
 772|  175k|  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
 773|  175k|  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
 774|  175k|  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
 775|  175k|  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
 776|  175k|  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
 777|  175k|  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
 778|  175k|  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
 779|  175k|  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
 780|  175k|  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
 781|  175k|  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
 782|  175k|  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
 783|  175k|  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
 784|  175k|  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
 785|  175k|  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
 786|  175k|  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 787|  175k|  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
 788|  175k|  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
 789|  175k|  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
 790|      |
 791|      |  // stage 1
 792|  175k|  __m256i x1[32];
 793|  175k|  x1[0] = input[0];
 794|  175k|  x1[1] = input[16];
 795|  175k|  x1[2] = input[8];
 796|  175k|  x1[3] = input[24];
 797|  175k|  x1[4] = input[4];
 798|  175k|  x1[5] = input[20];
 799|  175k|  x1[6] = input[12];
 800|  175k|  x1[7] = input[28];
 801|  175k|  x1[8] = input[2];
 802|  175k|  x1[9] = input[18];
 803|  175k|  x1[10] = input[10];
 804|  175k|  x1[11] = input[26];
 805|  175k|  x1[12] = input[6];
 806|  175k|  x1[13] = input[22];
 807|  175k|  x1[14] = input[14];
 808|  175k|  x1[15] = input[30];
 809|  175k|  x1[16] = input[1];
 810|  175k|  x1[17] = input[17];
 811|  175k|  x1[18] = input[9];
 812|  175k|  x1[19] = input[25];
 813|  175k|  x1[20] = input[5];
 814|  175k|  x1[21] = input[21];
 815|  175k|  x1[22] = input[13];
 816|  175k|  x1[23] = input[29];
 817|  175k|  x1[24] = input[3];
 818|  175k|  x1[25] = input[19];
 819|  175k|  x1[26] = input[11];
 820|  175k|  x1[27] = input[27];
 821|  175k|  x1[28] = input[7];
 822|  175k|  x1[29] = input[23];
 823|  175k|  x1[30] = input[15];
 824|  175k|  x1[31] = input[31];
 825|      |
 826|      |  // stage 2
 827|  175k|  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r,
 828|  175k|                  INV_COS_BIT);
 829|  175k|  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r,
 830|  175k|                  INV_COS_BIT);
 831|  175k|  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r,
 832|  175k|                  INV_COS_BIT);
 833|  175k|  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r,
 834|  175k|                  INV_COS_BIT);
 835|  175k|  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r,
 836|  175k|                  INV_COS_BIT);
 837|  175k|  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r,
 838|  175k|                  INV_COS_BIT);
 839|  175k|  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r,
 840|  175k|                  INV_COS_BIT);
 841|  175k|  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r,
 842|  175k|                  INV_COS_BIT);
 843|      |
 844|      |  // stage 3
 845|  175k|  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
 846|  175k|                  INV_COS_BIT);
 847|  175k|  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
 848|  175k|                  INV_COS_BIT);
 849|  175k|  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
 850|  175k|                  INV_COS_BIT);
 851|  175k|  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
 852|  175k|                  INV_COS_BIT);
 853|  175k|  idct32_high16_stage3_avx2(x1);
 854|      |
 855|      |  // stage 4
 856|  175k|  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
 857|  175k|                  INV_COS_BIT);
 858|  175k|  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
 859|  175k|                  INV_COS_BIT);
 860|  175k|  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
 861|  175k|  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
 862|  175k|  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
 863|  175k|  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
 864|  175k|  idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
 865|      |
 866|      |  // stage 5
 867|  175k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
 868|  175k|                  INV_COS_BIT);
 869|  175k|  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
 870|  175k|                  INV_COS_BIT);
 871|  175k|  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
 872|  175k|  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
 873|  175k|  idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
 874|      |
 875|      |  // stage 6
 876|  175k|  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
 877|  175k|  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
 878|  175k|  idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
 879|      |
 880|  175k|  idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT);
 881|  175k|  idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
 882|  175k|  idct32_stage9_avx2(output, x1);
 883|  175k|}
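
Four variants of the same 32-point column transform appear above (low1, low8, low16, full) because the caller can skip butterfly work when the quantized block is mostly zeros; the hit counts show the sparse paths run far more often than the full one. A hypothetical end-of-block selector, for illustration only (the library itself dispatches through lookup tables keyed on the eob position, and its thresholds may differ):

    /* Hypothetical dispatcher sketch; relies on the functions defined above. */
    typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output);

    static transform_1d_avx2 pick_idct32(int eob) {
      if (eob == 1) return idct32_low1_avx2;    /* DC coefficient only */
      if (eob <= 8) return idct32_low8_avx2;    /* first 8 coefficient rows */
      if (eob <= 16) return idct32_low16_avx2;  /* first 16 coefficient rows */
      return idct32_avx2;                       /* full 32-point transform */
    }
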
884
885
static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
886
263k
                                             const __m256i _r, int8_t cos_bit) {
887
263k
  (void)cos_bit;
888
263k
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
889
263k
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
890
263k
  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
891
263k
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
892
263k
  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
893
263k
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
894
263k
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
895
263k
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
896
263k
  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
897
263k
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
898
263k
  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
899
263k
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
900
263k
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
901
263k
  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
902
263k
  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
903
263k
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
904
263k
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
905
263k
  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
906
263k
  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
907
263k
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
908
263k
}
909
910
static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
911
263k
                                             const __m256i _r, int8_t cos_bit) {
912
263k
  (void)cos_bit;
913
263k
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
914
263k
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
915
263k
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
916
263k
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
917
263k
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
918
263k
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
919
263k
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
920
263k
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
921
263k
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
922
263k
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
923
263k
  btf_16_adds_subs_avx2(&x[32], &x[35]);
924
263k
  btf_16_adds_subs_avx2(&x[33], &x[34]);
925
263k
  btf_16_adds_subs_avx2(&x[39], &x[36]);
926
263k
  btf_16_adds_subs_avx2(&x[38], &x[37]);
927
263k
  btf_16_adds_subs_avx2(&x[40], &x[43]);
928
263k
  btf_16_adds_subs_avx2(&x[41], &x[42]);
929
263k
  btf_16_adds_subs_avx2(&x[47], &x[44]);
930
263k
  btf_16_adds_subs_avx2(&x[46], &x[45]);
931
263k
  btf_16_adds_subs_avx2(&x[48], &x[51]);
932
263k
  btf_16_adds_subs_avx2(&x[49], &x[50]);
933
263k
  btf_16_adds_subs_avx2(&x[55], &x[52]);
934
263k
  btf_16_adds_subs_avx2(&x[54], &x[53]);
935
263k
  btf_16_adds_subs_avx2(&x[56], &x[59]);
936
263k
  btf_16_adds_subs_avx2(&x[57], &x[58]);
937
263k
  btf_16_adds_subs_avx2(&x[63], &x[60]);
938
263k
  btf_16_adds_subs_avx2(&x[62], &x[61]);
939
263k
}
940
941
static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
942
506k
                                             const __m256i _r, int8_t cos_bit) {
943
506k
  (void)cos_bit;
944
506k
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
945
506k
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
946
506k
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
947
506k
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
948
506k
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
949
506k
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
950
506k
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
951
506k
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
952
506k
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
953
506k
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
954
506k
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
955
506k
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
956
506k
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
957
506k
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
958
506k
}
959
960
static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
961
263k
                                             const __m256i _r, int8_t cos_bit) {
962
263k
  btf_16_adds_subs_avx2(&x[16], &x[19]);
963
263k
  btf_16_adds_subs_avx2(&x[17], &x[18]);
964
263k
  btf_16_adds_subs_avx2(&x[23], &x[20]);
965
263k
  btf_16_adds_subs_avx2(&x[22], &x[21]);
966
263k
  btf_16_adds_subs_avx2(&x[24], &x[27]);
967
263k
  btf_16_adds_subs_avx2(&x[25], &x[26]);
968
263k
  btf_16_adds_subs_avx2(&x[31], &x[28]);
969
263k
  btf_16_adds_subs_avx2(&x[30], &x[29]);
970
263k
  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
971
263k
}
972
973
static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
974
506k
                                             const __m256i _r, int8_t cos_bit) {
975
506k
  (void)cos_bit;
976
506k
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
977
506k
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
978
506k
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
979
506k
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
980
506k
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
981
506k
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
982
506k
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
983
506k
  btf_16_adds_subs_avx2(&x[32], &x[39]);
984
506k
  btf_16_adds_subs_avx2(&x[33], &x[38]);
985
506k
  btf_16_adds_subs_avx2(&x[34], &x[37]);
986
506k
  btf_16_adds_subs_avx2(&x[35], &x[36]);
987
506k
  btf_16_adds_subs_avx2(&x[47], &x[40]);
988
506k
  btf_16_adds_subs_avx2(&x[46], &x[41]);
989
506k
  btf_16_adds_subs_avx2(&x[45], &x[42]);
990
506k
  btf_16_adds_subs_avx2(&x[44], &x[43]);
991
506k
  btf_16_adds_subs_avx2(&x[48], &x[55]);
992
506k
  btf_16_adds_subs_avx2(&x[49], &x[54]);
993
506k
  btf_16_adds_subs_avx2(&x[50], &x[53]);
994
506k
  btf_16_adds_subs_avx2(&x[51], &x[52]);
995
506k
  btf_16_adds_subs_avx2(&x[63], &x[56]);
996
506k
  btf_16_adds_subs_avx2(&x[62], &x[57]);
997
506k
  btf_16_adds_subs_avx2(&x[61], &x[58]);
998
506k
  btf_16_adds_subs_avx2(&x[60], &x[59]);
999
506k
}
1000
1001
static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
1002
506k
                                             const __m256i _r, int8_t cos_bit) {
1003
506k
  (void)cos_bit;
1004
506k
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
1005
506k
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
1006
506k
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
1007
506k
  btf_16_adds_subs_avx2(&x[16], &x[23]);
1008
506k
  btf_16_adds_subs_avx2(&x[17], &x[22]);
1009
506k
  btf_16_adds_subs_avx2(&x[18], &x[21]);
1010
506k
  btf_16_adds_subs_avx2(&x[19], &x[20]);
1011
506k
  btf_16_adds_subs_avx2(&x[31], &x[24]);
1012
506k
  btf_16_adds_subs_avx2(&x[30], &x[25]);
1013
506k
  btf_16_adds_subs_avx2(&x[29], &x[26]);
1014
506k
  btf_16_adds_subs_avx2(&x[28], &x[27]);
1015
506k
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
1016
506k
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
1017
506k
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
1018
506k
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
1019
506k
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
1020
506k
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
1021
506k
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
1022
506k
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
1023
506k
}
1024
1025
static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
1026
506k
                                      const __m256i _r, int8_t cos_bit) {
1027
506k
  (void)cos_bit;
1028
506k
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1029
506k
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1030
506k
  btf_16_adds_subs_avx2(&x[0], &x[15]);
1031
506k
  btf_16_adds_subs_avx2(&x[1], &x[14]);
1032
506k
  btf_16_adds_subs_avx2(&x[2], &x[13]);
1033
506k
  btf_16_adds_subs_avx2(&x[3], &x[12]);
1034
506k
  btf_16_adds_subs_avx2(&x[4], &x[11]);
1035
506k
  btf_16_adds_subs_avx2(&x[5], &x[10]);
1036
506k
  btf_16_adds_subs_avx2(&x[6], &x[9]);
1037
506k
  btf_16_adds_subs_avx2(&x[7], &x[8]);
1038
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
1039
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
1040
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
1041
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
1042
506k
  btf_16_adds_subs_avx2(&x[32], &x[47]);
1043
506k
  btf_16_adds_subs_avx2(&x[33], &x[46]);
1044
506k
  btf_16_adds_subs_avx2(&x[34], &x[45]);
1045
506k
  btf_16_adds_subs_avx2(&x[35], &x[44]);
1046
506k
  btf_16_adds_subs_avx2(&x[36], &x[43]);
1047
506k
  btf_16_adds_subs_avx2(&x[37], &x[42]);
1048
506k
  btf_16_adds_subs_avx2(&x[38], &x[41]);
1049
506k
  btf_16_adds_subs_avx2(&x[39], &x[40]);
1050
506k
  btf_16_adds_subs_avx2(&x[63], &x[48]);
1051
506k
  btf_16_adds_subs_avx2(&x[62], &x[49]);
1052
506k
  btf_16_adds_subs_avx2(&x[61], &x[50]);
1053
506k
  btf_16_adds_subs_avx2(&x[60], &x[51]);
1054
506k
  btf_16_adds_subs_avx2(&x[59], &x[52]);
1055
506k
  btf_16_adds_subs_avx2(&x[58], &x[53]);
1056
506k
  btf_16_adds_subs_avx2(&x[57], &x[54]);
1057
506k
  btf_16_adds_subs_avx2(&x[56], &x[55]);
1058
506k
}
1059
1060
static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
1061
506k
                                       const __m256i _r, int8_t cos_bit) {
1062
506k
  (void)cos_bit;
1063
506k
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1064
506k
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1065
506k
  btf_16_adds_subs_avx2(&x[0], &x[31]);
1066
506k
  btf_16_adds_subs_avx2(&x[1], &x[30]);
1067
506k
  btf_16_adds_subs_avx2(&x[2], &x[29]);
1068
506k
  btf_16_adds_subs_avx2(&x[3], &x[28]);
1069
506k
  btf_16_adds_subs_avx2(&x[4], &x[27]);
1070
506k
  btf_16_adds_subs_avx2(&x[5], &x[26]);
1071
506k
  btf_16_adds_subs_avx2(&x[6], &x[25]);
1072
506k
  btf_16_adds_subs_avx2(&x[7], &x[24]);
1073
506k
  btf_16_adds_subs_avx2(&x[8], &x[23]);
1074
506k
  btf_16_adds_subs_avx2(&x[9], &x[22]);
1075
506k
  btf_16_adds_subs_avx2(&x[10], &x[21]);
1076
506k
  btf_16_adds_subs_avx2(&x[11], &x[20]);
1077
506k
  btf_16_adds_subs_avx2(&x[12], &x[19]);
1078
506k
  btf_16_adds_subs_avx2(&x[13], &x[18]);
1079
506k
  btf_16_adds_subs_avx2(&x[14], &x[17]);
1080
506k
  btf_16_adds_subs_avx2(&x[15], &x[16]);
1081
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
1082
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
1083
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
1084
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
1085
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
1086
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
1087
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
1088
506k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
1089
506k
}
1090
1091
506k
static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
1092
506k
  btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
1093
506k
  btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
1094
506k
  btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
1095
506k
  btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
1096
506k
  btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
1097
506k
  btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
1098
506k
  btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
1099
506k
  btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
1100
506k
  btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
1101
506k
  btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
1102
506k
  btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
1103
506k
  btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
1104
506k
  btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
1105
506k
  btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
1106
506k
  btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
1107
506k
  btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
1108
506k
  btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
1109
506k
  btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
1110
506k
  btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
1111
506k
  btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
1112
506k
  btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
1113
506k
  btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
1114
506k
  btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
1115
506k
  btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
1116
506k
  btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
1117
506k
  btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
1118
506k
  btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
1119
506k
  btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
1120
506k
  btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
1121
506k
  btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
1122
506k
  btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
1123
506k
  btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
1124
506k
}
1125
1126
170k
static void idct64_low1_avx2(const __m256i *input, __m256i *output) {
1127
170k
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
1128
1129
  // stage 1
1130
170k
  __m256i x[32];
1131
170k
  x[0] = input[0];
1132
1133
  // stage 2
1134
  // stage 3
1135
  // stage 4
1136
  // stage 5
1137
  // stage 6
1138
170k
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
1139
1140
  // stage 7
1141
  // stage 8
1142
  // stage 9
1143
  // stage 10
1144
  // stage 11
1145
170k
  output[0] = x[0];
1146
170k
  output[63] = x[0];
1147
170k
  output[1] = x[1];
1148
170k
  output[62] = x[1];
1149
170k
  output[2] = x[1];
1150
170k
  output[61] = x[1];
1151
170k
  output[3] = x[0];
1152
170k
  output[60] = x[0];
1153
170k
  output[4] = x[0];
1154
170k
  output[59] = x[0];
1155
170k
  output[5] = x[1];
1156
170k
  output[58] = x[1];
1157
170k
  output[6] = x[1];
1158
170k
  output[57] = x[1];
1159
170k
  output[7] = x[0];
1160
170k
  output[56] = x[0];
1161
170k
  output[8] = x[0];
1162
170k
  output[55] = x[0];
1163
170k
  output[9] = x[1];
1164
170k
  output[54] = x[1];
1165
170k
  output[10] = x[1];
1166
170k
  output[53] = x[1];
1167
170k
  output[11] = x[0];
1168
170k
  output[52] = x[0];
1169
170k
  output[12] = x[0];
1170
170k
  output[51] = x[0];
1171
170k
  output[13] = x[1];
1172
170k
  output[50] = x[1];
1173
170k
  output[14] = x[1];
1174
170k
  output[49] = x[1];
1175
170k
  output[15] = x[0];
1176
170k
  output[48] = x[0];
1177
170k
  output[16] = x[0];
1178
170k
  output[47] = x[0];
1179
170k
  output[17] = x[1];
1180
170k
  output[46] = x[1];
1181
170k
  output[18] = x[1];
1182
170k
  output[45] = x[1];
1183
170k
  output[19] = x[0];
1184
170k
  output[44] = x[0];
1185
170k
  output[20] = x[0];
1186
170k
  output[43] = x[0];
1187
170k
  output[21] = x[1];
1188
170k
  output[42] = x[1];
1189
170k
  output[22] = x[1];
1190
170k
  output[41] = x[1];
1191
170k
  output[23] = x[0];
1192
170k
  output[40] = x[0];
1193
170k
  output[24] = x[0];
1194
170k
  output[39] = x[0];
1195
170k
  output[25] = x[1];
1196
170k
  output[38] = x[1];
1197
170k
  output[26] = x[1];
1198
170k
  output[37] = x[1];
1199
170k
  output[27] = x[0];
1200
170k
  output[36] = x[0];
1201
170k
  output[28] = x[0];
1202
170k
  output[35] = x[0];
1203
170k
  output[29] = x[1];
1204
170k
  output[34] = x[1];
1205
170k
  output[30] = x[1];
1206
170k
  output[33] = x[1];
1207
170k
  output[31] = x[0];
1208
170k
  output[32] = x[0];
1209
170k
}
1210
1211
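// Illustrative sketch: why the DC-only path above is valid.
// btf_16_w16_0_avx2(w0, w1, in, out0, out1) rounds in * w0 and in * w1 down
// by INV_COS_BIT bits per lane (via _mm256_mulhrs_epi16 with the weights
// pre-scaled by 8), so with w0 == w1 == cospi[32] the halves x[0] and x[1]
// hold the same value and all 64 outputs carry the rounded DC term. A scalar
// model, ignoring 16-bit saturation (the helper name is hypothetical):
static INLINE int16_t idct64_dc_model(int16_t in, const int32_t *cospi) {
  // round(in * cospi[32] / 2^INV_COS_BIT), the value every output receives.
  return (int16_t)((in * cospi[32] + (1 << (INV_COS_BIT - 1))) >> INV_COS_BIT);
}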
242k
static void idct64_low8_avx2(const __m256i *input, __m256i *output) {
1212
242k
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
1213
242k
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
1214
242k
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
1215
242k
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
1216
242k
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
1217
242k
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
1218
242k
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
1219
242k
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
1220
242k
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
1221
242k
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
1222
242k
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
1223
242k
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
1224
242k
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
1225
242k
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
1226
242k
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1227
242k
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
1228
242k
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
1229
242k
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1230
1231
  // stage 1
1232
242k
  __m256i x[64];
1233
242k
  x[0] = input[0];
1234
242k
  x[8] = input[4];
1235
242k
  x[16] = input[2];
1236
242k
  x[24] = input[6];
1237
242k
  x[32] = input[1];
1238
242k
  x[40] = input[5];
1239
242k
  x[48] = input[3];
1240
242k
  x[56] = input[7];
1241
1242
  // stage 2
1243
242k
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
1244
242k
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
1245
242k
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
1246
242k
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
1247
1248
  // stage 3
1249
242k
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
1250
242k
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
1251
242k
  x[33] = x[32];
1252
242k
  x[38] = x[39];
1253
242k
  x[41] = x[40];
1254
242k
  x[46] = x[47];
1255
242k
  x[49] = x[48];
1256
242k
  x[54] = x[55];
1257
242k
  x[57] = x[56];
1258
242k
  x[62] = x[63];
1259
1260
  // stage 4
1261
242k
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
1262
242k
  x[17] = x[16];
1263
242k
  x[22] = x[23];
1264
242k
  x[25] = x[24];
1265
242k
  x[30] = x[31];
1266
242k
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r,
1267
242k
                  INV_COS_BIT);
1268
242k
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r,
1269
242k
                  INV_COS_BIT);
1270
242k
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r,
1271
242k
                  INV_COS_BIT);
1272
242k
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r,
1273
242k
                  INV_COS_BIT);
1274
1275
  // stage 5
1276
242k
  x[9] = x[8];
1277
242k
  x[14] = x[15];
1278
242k
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r,
1279
242k
                  INV_COS_BIT);
1280
242k
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r,
1281
242k
                  INV_COS_BIT);
1282
242k
  x[35] = x[32];
1283
242k
  x[34] = x[33];
1284
242k
  x[36] = x[39];
1285
242k
  x[37] = x[38];
1286
242k
  x[43] = x[40];
1287
242k
  x[42] = x[41];
1288
242k
  x[44] = x[47];
1289
242k
  x[45] = x[46];
1290
242k
  x[51] = x[48];
1291
242k
  x[50] = x[49];
1292
242k
  x[52] = x[55];
1293
242k
  x[53] = x[54];
1294
242k
  x[59] = x[56];
1295
242k
  x[58] = x[57];
1296
242k
  x[60] = x[63];
1297
242k
  x[61] = x[62];
1298
1299
  // stage 6
1300
242k
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
1301
242k
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
1302
242k
  x[19] = x[16];
1303
242k
  x[18] = x[17];
1304
242k
  x[20] = x[23];
1305
242k
  x[21] = x[22];
1306
242k
  x[27] = x[24];
1307
242k
  x[26] = x[25];
1308
242k
  x[28] = x[31];
1309
242k
  x[29] = x[30];
1310
242k
  idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT);
1311
1312
  // stage 7
1313
242k
  x[3] = x[0];
1314
242k
  x[2] = x[1];
1315
242k
  x[11] = x[8];
1316
242k
  x[10] = x[9];
1317
242k
  x[12] = x[15];
1318
242k
  x[13] = x[14];
1319
242k
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
1320
1321
  // stage 8
1322
242k
  x[7] = x[0];
1323
242k
  x[6] = x[1];
1324
242k
  x[5] = x[2];
1325
242k
  x[4] = x[3];
1326
242k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
1327
242k
                  INV_COS_BIT);
1328
242k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
1329
242k
                  INV_COS_BIT);
1330
242k
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
1331
1332
242k
  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
1333
242k
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
1334
242k
  idct64_stage11_avx2(output, x);
1335
242k
}
1336
1337
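// Note on the "low8" variant above: the eob-based dispatch below guarantees
// that at most the first eight coefficients along this dimension are nonzero.
// Each early-stage butterfly therefore has one all-zero input, so stages 2-4
// largely collapse into a single btf_16_w16_0_avx2 multiply per pair plus
// plain copies instead of full add/sub butterflies.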
138k
static void idct64_low16_avx2(const __m256i *input, __m256i *output) {
1338
138k
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
1339
138k
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
1340
1341
138k
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1342
138k
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
1343
138k
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
1344
138k
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
1345
138k
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1346
1347
  // stage 1
1348
138k
  __m256i x[64];
1349
138k
  x[0] = input[0];
1350
138k
  x[4] = input[8];
1351
138k
  x[8] = input[4];
1352
138k
  x[12] = input[12];
1353
138k
  x[16] = input[2];
1354
138k
  x[20] = input[10];
1355
138k
  x[24] = input[6];
1356
138k
  x[28] = input[14];
1357
138k
  x[32] = input[1];
1358
138k
  x[36] = input[9];
1359
138k
  x[40] = input[5];
1360
138k
  x[44] = input[13];
1361
138k
  x[48] = input[3];
1362
138k
  x[52] = input[11];
1363
138k
  x[56] = input[7];
1364
138k
  x[60] = input[15];
1365
1366
  // stage 2
1367
138k
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
1368
138k
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
1369
138k
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
1370
138k
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
1371
138k
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
1372
138k
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
1373
138k
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
1374
138k
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
1375
1376
  // stage 3
1377
138k
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
1378
138k
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
1379
138k
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
1380
138k
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
1381
138k
  x[33] = x[32];
1382
138k
  x[34] = x[35];
1383
138k
  x[37] = x[36];
1384
138k
  x[38] = x[39];
1385
138k
  x[41] = x[40];
1386
138k
  x[42] = x[43];
1387
138k
  x[45] = x[44];
1388
138k
  x[46] = x[47];
1389
138k
  x[49] = x[48];
1390
138k
  x[50] = x[51];
1391
138k
  x[53] = x[52];
1392
138k
  x[54] = x[55];
1393
138k
  x[57] = x[56];
1394
138k
  x[58] = x[59];
1395
138k
  x[61] = x[60];
1396
138k
  x[62] = x[63];
1397
1398
  // stage 4
1399
138k
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
1400
138k
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
1401
138k
  x[17] = x[16];
1402
138k
  x[18] = x[19];
1403
138k
  x[21] = x[20];
1404
138k
  x[22] = x[23];
1405
138k
  x[25] = x[24];
1406
138k
  x[26] = x[27];
1407
138k
  x[29] = x[28];
1408
138k
  x[30] = x[31];
1409
138k
  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
1410
1411
  // stage 5
1412
138k
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
1413
138k
  x[9] = x[8];
1414
138k
  x[10] = x[11];
1415
138k
  x[13] = x[12];
1416
138k
  x[14] = x[15];
1417
138k
  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
1418
1419
  // stage 6
1420
138k
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
1421
138k
  x[5] = x[4];
1422
138k
  x[6] = x[7];
1423
138k
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
1424
138k
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
1425
138k
                  INV_COS_BIT);
1426
138k
  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
1427
1428
  // stage 7
1429
138k
  x[3] = x[0];
1430
138k
  x[2] = x[1];
1431
138k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
1432
138k
  btf_16_adds_subs_avx2(&x[8], &x[11]);
1433
138k
  btf_16_adds_subs_avx2(&x[9], &x[10]);
1434
138k
  btf_16_adds_subs_avx2(&x[15], &x[12]);
1435
138k
  btf_16_adds_subs_avx2(&x[14], &x[13]);
1436
138k
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
1437
1438
  // stage 8
1439
138k
  btf_16_adds_subs_avx2(&x[0], &x[7]);
1440
138k
  btf_16_adds_subs_avx2(&x[1], &x[6]);
1441
138k
  btf_16_adds_subs_avx2(&x[2], &x[5]);
1442
138k
  btf_16_adds_subs_avx2(&x[3], &x[4]);
1443
138k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
1444
138k
                  INV_COS_BIT);
1445
138k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
1446
138k
                  INV_COS_BIT);
1447
138k
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
1448
1449
138k
  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
1450
138k
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
1451
138k
  idct64_stage11_avx2(output, x);
1452
138k
}
1453
1454
125k
static void idct64_low32_avx2(const __m256i *input, __m256i *output) {
1455
125k
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
1456
125k
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
1457
1458
125k
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1459
125k
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
1460
125k
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
1461
125k
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
1462
125k
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1463
1464
  // stage 1
1465
125k
  __m256i x[64];
1466
125k
  x[0] = input[0];
1467
125k
  x[2] = input[16];
1468
125k
  x[4] = input[8];
1469
125k
  x[6] = input[24];
1470
125k
  x[8] = input[4];
1471
125k
  x[10] = input[20];
1472
125k
  x[12] = input[12];
1473
125k
  x[14] = input[28];
1474
125k
  x[16] = input[2];
1475
125k
  x[18] = input[18];
1476
125k
  x[20] = input[10];
1477
125k
  x[22] = input[26];
1478
125k
  x[24] = input[6];
1479
125k
  x[26] = input[22];
1480
125k
  x[28] = input[14];
1481
125k
  x[30] = input[30];
1482
125k
  x[32] = input[1];
1483
125k
  x[34] = input[17];
1484
125k
  x[36] = input[9];
1485
125k
  x[38] = input[25];
1486
125k
  x[40] = input[5];
1487
125k
  x[42] = input[21];
1488
125k
  x[44] = input[13];
1489
125k
  x[46] = input[29];
1490
125k
  x[48] = input[3];
1491
125k
  x[50] = input[19];
1492
125k
  x[52] = input[11];
1493
125k
  x[54] = input[27];
1494
125k
  x[56] = input[7];
1495
125k
  x[58] = input[23];
1496
125k
  x[60] = input[15];
1497
125k
  x[62] = input[31];
1498
1499
  // stage 2
1500
125k
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
1501
125k
  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
1502
125k
  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
1503
125k
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
1504
125k
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
1505
125k
  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
1506
125k
  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
1507
125k
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
1508
125k
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
1509
125k
  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
1510
125k
  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
1511
125k
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
1512
125k
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
1513
125k
  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
1514
125k
  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
1515
125k
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
1516
1517
  // stage 3
1518
125k
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
1519
125k
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
1520
125k
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
1521
125k
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
1522
125k
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
1523
125k
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
1524
125k
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
1525
125k
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
1526
125k
  btf_16_adds_subs_avx2(&x[32], &x[33]);
1527
125k
  btf_16_adds_subs_avx2(&x[35], &x[34]);
1528
125k
  btf_16_adds_subs_avx2(&x[36], &x[37]);
1529
125k
  btf_16_adds_subs_avx2(&x[39], &x[38]);
1530
125k
  btf_16_adds_subs_avx2(&x[40], &x[41]);
1531
125k
  btf_16_adds_subs_avx2(&x[43], &x[42]);
1532
125k
  btf_16_adds_subs_avx2(&x[44], &x[45]);
1533
125k
  btf_16_adds_subs_avx2(&x[47], &x[46]);
1534
125k
  btf_16_adds_subs_avx2(&x[48], &x[49]);
1535
125k
  btf_16_adds_subs_avx2(&x[51], &x[50]);
1536
125k
  btf_16_adds_subs_avx2(&x[52], &x[53]);
1537
125k
  btf_16_adds_subs_avx2(&x[55], &x[54]);
1538
125k
  btf_16_adds_subs_avx2(&x[56], &x[57]);
1539
125k
  btf_16_adds_subs_avx2(&x[59], &x[58]);
1540
125k
  btf_16_adds_subs_avx2(&x[60], &x[61]);
1541
125k
  btf_16_adds_subs_avx2(&x[63], &x[62]);
1542
1543
  // stage 4
1544
125k
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
1545
125k
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
1546
125k
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
1547
125k
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
1548
125k
  btf_16_adds_subs_avx2(&x[16], &x[17]);
1549
125k
  btf_16_adds_subs_avx2(&x[19], &x[18]);
1550
125k
  btf_16_adds_subs_avx2(&x[20], &x[21]);
1551
125k
  btf_16_adds_subs_avx2(&x[23], &x[22]);
1552
125k
  btf_16_adds_subs_avx2(&x[24], &x[25]);
1553
125k
  btf_16_adds_subs_avx2(&x[27], &x[26]);
1554
125k
  btf_16_adds_subs_avx2(&x[28], &x[29]);
1555
125k
  btf_16_adds_subs_avx2(&x[31], &x[30]);
1556
125k
  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
1557
1558
  // stage 5
1559
125k
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
1560
125k
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
1561
125k
  btf_16_adds_subs_avx2(&x[8], &x[9]);
1562
125k
  btf_16_adds_subs_avx2(&x[11], &x[10]);
1563
125k
  btf_16_adds_subs_avx2(&x[12], &x[13]);
1564
125k
  btf_16_adds_subs_avx2(&x[15], &x[14]);
1565
125k
  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
1566
1567
  // stage 6
1568
125k
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
1569
125k
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
1570
125k
  btf_16_adds_subs_avx2(&x[4], &x[5]);
1571
125k
  btf_16_adds_subs_avx2(&x[7], &x[6]);
1572
125k
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
1573
125k
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
1574
125k
                  INV_COS_BIT);
1575
125k
  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
1576
1577
  // stage 7
1578
125k
  btf_16_adds_subs_avx2(&x[0], &x[3]);
1579
125k
  btf_16_adds_subs_avx2(&x[1], &x[2]);
1580
125k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
1581
125k
  btf_16_adds_subs_avx2(&x[8], &x[11]);
1582
125k
  btf_16_adds_subs_avx2(&x[9], &x[10]);
1583
125k
  btf_16_adds_subs_avx2(&x[15], &x[12]);
1584
125k
  btf_16_adds_subs_avx2(&x[14], &x[13]);
1585
125k
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
1586
1587
  // stage 8
1588
125k
  btf_16_adds_subs_avx2(&x[0], &x[7]);
1589
125k
  btf_16_adds_subs_avx2(&x[1], &x[6]);
1590
125k
  btf_16_adds_subs_avx2(&x[2], &x[5]);
1591
125k
  btf_16_adds_subs_avx2(&x[3], &x[4]);
1592
125k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
1593
125k
                  INV_COS_BIT);
1594
125k
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
1595
125k
                  INV_COS_BIT);
1596
125k
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
1597
1598
  // stages 9-11
1599
125k
  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
1600
125k
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
1601
125k
  idct64_stage11_avx2(output, x);
1602
125k
}
1603
1604
typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output);
1605
1606
// 1D functions process 16 pixels at a time.
1607
static const transform_1d_avx2
1608
    lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
1609
      {
1610
          { NULL, NULL, NULL, NULL },
1611
          { NULL, NULL, NULL, NULL },
1612
          { NULL, NULL, NULL, NULL },
1613
      },
1614
      { { NULL, NULL, NULL, NULL },
1615
        { NULL, NULL, NULL, NULL },
1616
        { NULL, NULL, NULL, NULL } },
1617
      {
1618
          { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
1619
          { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
1620
          { NULL, NULL, NULL, NULL },
1621
      },
1622
      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
1623
        { NULL, NULL, NULL, NULL },
1624
        { NULL, NULL, NULL, NULL } },
1625
      { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
1626
          idct64_low32_avx2 },
1627
        { NULL, NULL, NULL, NULL },
1628
        { NULL, NULL, NULL, NULL } }
1629
    };
1630
1631
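// Illustrative sketch of how the table above is indexed:
// lowbd_txfm_all_1d_zeros_idx (declared in av1_inv_txfm_ssse3.h) maps the
// last nonzero row/column index derived from the eob to 0..3, selecting the
// low1, low8, low16, or full variant. The helper name is hypothetical:
static INLINE transform_1d_avx2 pick_col_idct64(int eoby) {
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
  // First index 4 is the txh_idx of a 64-tap 1D transform; second index 0
  // selects the DCT row of the table.
  return lowbd_txfm_all_1d_zeros_w16_arr[4][0][fun_idx];
}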
// Only handles transforms with w >= 16 and h >= 16.
1632
static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
1633
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
1634
1.74M
    TX_SIZE tx_size, int eob) {
1635
1.74M
  __m256i buf1[64 * 16];
1636
1.74M
  int eobx, eoby;
1637
1.74M
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
1638
1.74M
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
1639
1.74M
  const int txw_idx = get_txw_idx(tx_size);
1640
1.74M
  const int txh_idx = get_txh_idx(tx_size);
1641
1.74M
  const int txfm_size_col = tx_size_wide[tx_size];
1642
1.74M
  const int txfm_size_row = tx_size_high[tx_size];
1643
1.74M
  const int buf_size_w_div16 = txfm_size_col >> 4;
1644
1.74M
  const int buf_size_nonzero_w = ((eobx + 16) >> 4) << 4;
1645
1.74M
  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
1646
1.74M
  const int input_stride = AOMMIN(32, txfm_size_row);
1647
1.74M
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
1648
1649
1.74M
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
1650
1.74M
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
1651
1.74M
  const transform_1d_avx2 row_txfm =
1652
1.74M
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
1653
1.74M
  const transform_1d_avx2 col_txfm =
1654
1.74M
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
1655
1656
1.74M
  assert(col_txfm != NULL);
1657
1.74M
  assert(row_txfm != NULL);
1658
1.74M
  int ud_flip, lr_flip;
1659
1.74M
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1660
1.74M
  const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
1661
3.55M
  for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
1662
1.81M
    __m256i buf0[64];
1663
1.81M
    load_buffer_32bit_to_16bit_w16_avx2(input + 16 * i, input_stride, buf0,
1664
1.81M
                                        buf_size_nonzero_w);
1665
1.81M
    if (rect_type == 1 || rect_type == -1) {
1666
300k
      round_shift_avx2(buf0, buf0, buf_size_nonzero_w);  // rect special code
1667
300k
    }
1668
1.81M
    row_txfm(buf0, buf0);
1669
54.1M
    for (int j = 0; j < txfm_size_col; ++j) {
1670
52.3M
      buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
1671
52.3M
    }
1672
1673
1.81M
    __m256i *buf1_cur = buf1 + (i << 4);
1674
1.81M
    if (lr_flip) {
1675
60.3k
      for (int j = 0; j < buf_size_w_div16; ++j) {
1676
30.1k
        __m256i temp[16];
1677
30.1k
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
1678
30.1k
        int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
1679
30.1k
        transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
1680
30.1k
      }
1681
1.78M
    } else {
1682
5.02M
      for (int j = 0; j < buf_size_w_div16; ++j) {
1683
3.24M
        transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
1684
3.24M
      }
1685
1.78M
    }
1686
1.81M
  }
1687
1.74M
  const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
1688
4.85M
  for (int i = 0; i < buf_size_w_div16; i++) {
1689
3.10M
    __m256i *buf1_cur = buf1 + i * txfm_size_row;
1690
3.10M
    col_txfm(buf1_cur, buf1_cur);
1691
88.4M
    for (int j = 0; j < txfm_size_row; ++j) {
1692
85.2M
      buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
1693
85.2M
    }
1694
3.10M
  }
1695
4.85M
  for (int i = 0; i < buf_size_w_div16; i++) {
1696
3.10M
    lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
1697
3.10M
                                 stride, ud_flip, txfm_size_row);
1698
3.10M
  }
1699
1.74M
}
1700
1701
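// Illustrative sketch of the rounding trick behind scale0/scale1 above:
// _mm256_mulhrs_epi16 computes (a * b + (1 << 14)) >> 15 per 16-bit lane, so
// choosing b = 1 << (15 + shift) with a negative shift yields
// round(a * 2^shift), a rounding right shift with no separate add. Scalar
// equivalent (the helper name is hypothetical):
static INLINE int16_t mulhrs_round_shift_model(int16_t a, int shift) {
  const int32_t b = 1 << (15 + shift);  // shift is negative here
  // Equals (a + (1 << (-shift - 1))) >> -shift.
  return (int16_t)((a * b + (1 << 14)) >> 15);
}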
static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
1702
                                           int stride, int shift, int height,
1703
57.7k
                                           int txw_idx, int rect_type) {
1704
57.7k
  const int32_t *input_row = input;
1705
57.7k
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
1706
57.7k
  const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
1707
57.7k
                                       (1 << (NewSqrt2Bits - shift - 1)));
1708
57.7k
  const __m256i one = _mm256_set1_epi16(1);
1709
57.7k
  const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
1710
57.7k
  if (rect_type != 1 && rect_type != -1) {
1711
761k
    for (int i = 0; i < height; ++i) {
1712
717k
      const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
1713
717k
      input_row += stride;
1714
717k
      __m256i lo = _mm256_unpacklo_epi16(src, one);
1715
717k
      __m256i hi = _mm256_unpackhi_epi16(src, one);
1716
717k
      lo = _mm256_madd_epi16(lo, scale__r);
1717
717k
      hi = _mm256_madd_epi16(hi, scale__r);
1718
717k
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
1719
717k
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
1720
717k
      out[i] = _mm256_packs_epi32(lo, hi);
1721
717k
    }
1722
44.8k
  } else {
1723
12.9k
    const __m256i rect_scale =
1724
12.9k
        _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
1725
219k
    for (int i = 0; i < height; ++i) {
1726
206k
      __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
1727
206k
      src = _mm256_mulhrs_epi16(src, rect_scale);
1728
206k
      input_row += stride;
1729
206k
      __m256i lo = _mm256_unpacklo_epi16(src, one);
1730
206k
      __m256i hi = _mm256_unpackhi_epi16(src, one);
1731
206k
      lo = _mm256_madd_epi16(lo, scale__r);
1732
206k
      hi = _mm256_madd_epi16(hi, scale__r);
1733
206k
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
1734
206k
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
1735
206k
      out[i] = _mm256_packs_epi32(lo, hi);
1736
206k
    }
1737
12.9k
  }
1738
57.7k
}
1739
1740
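// Illustrative sketch of the madd-based scaling above: unpacking src against
// the constant 1 and the scale against _r lets one _mm256_madd_epi16 produce
// src * scale + _r, where _r folds the NewSqrt2 rounding offset and the stage
// round-shift offset into one constant so a single arithmetic shift finishes
// both steps. (The column version below applies the two rounding offsets
// separately.) Scalar equivalent, with a hypothetical helper name:
static INLINE int32_t iidentity_scale_model(int32_t src, int32_t scale,
                                            int shift) {
  const int32_t r = (1 << (NewSqrt2Bits - 1)) +
                    (1 << (NewSqrt2Bits - shift - 1));
  return (src * scale + r) >> (NewSqrt2Bits - shift);  // shift is negative
}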
static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
1741
                                           __m256i *buf, int shift, int height,
1742
66.2k
                                           int txh_idx) {
1743
66.2k
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
1744
66.2k
  const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
1745
66.2k
  const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
1746
66.2k
  const __m256i one = _mm256_set1_epi16(1);
1747
66.2k
  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
1748
1.12M
  for (int h = 0; h < height; ++h) {
1749
1.05M
    __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
1750
1.05M
    __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
1751
1.05M
    lo = _mm256_madd_epi16(lo, scale_coeff);
1752
1.05M
    hi = _mm256_madd_epi16(hi, scale_coeff);
1753
1.05M
    lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
1754
1.05M
    hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
1755
1.05M
    lo = _mm256_add_epi32(lo, shift__r);
1756
1.05M
    hi = _mm256_add_epi32(hi, shift__r);
1757
1.05M
    lo = _mm256_srai_epi32(lo, -shift);
1758
1.05M
    hi = _mm256_srai_epi32(hi, -shift);
1759
1.05M
    const __m256i x = _mm256_packs_epi32(lo, hi);
1760
1.05M
    write_recon_w16_avx2(x, output);
1761
1.05M
    output += stride;
1762
1.05M
  }
1763
66.2k
}
1764
1765
static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
1766
                                                  uint8_t *output, int stride,
1767
                                                  TX_SIZE tx_size,
1768
33.6k
                                                  int32_t eob) {
1769
33.6k
  (void)eob;
1770
33.6k
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
1771
33.6k
  const int txw_idx = get_txw_idx(tx_size);
1772
33.6k
  const int txh_idx = get_txh_idx(tx_size);
1773
33.6k
  const int txfm_size_col = tx_size_wide[tx_size];
1774
33.6k
  const int txfm_size_row = tx_size_high[tx_size];
1775
33.6k
  const int col_max = AOMMIN(32, txfm_size_col);
1776
33.6k
  const int row_max = AOMMIN(32, txfm_size_row);
1777
33.6k
  const int input_stride = row_max;
1778
33.6k
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
1779
33.6k
  __m256i buf[32];
1780
1781
73.0k
  for (int i = 0; i < (col_max >> 4); ++i) {
1782
85.7k
    for (int j = 0; j < (row_max >> 4); j++) {
1783
46.2k
      iidentity_row_16xn_avx2(buf, input + j * 16 + i * 16 * input_stride,
1784
46.2k
                              row_max, shift[0], 16, txw_idx, rect_type);
1785
46.2k
      transpose_16bit_16x16_avx2(buf, buf);
1786
46.2k
      iidentity_col_16xn_avx2(output + i * 16 + j * 16 * stride, stride, buf,
1787
46.2k
                              shift[1], 16, txh_idx);
1788
46.2k
    }
1789
39.4k
  }
1790
33.6k
}
1791
1792
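// Note on the IDTX path above: a 2D identity "transform" is just a separable
// rescale, so it is handled in independent 16x16 tiles: scale the rows,
// transpose the tile in registers, then scale the columns directly into the
// destination.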
static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
1793
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
1794
11.4k
    TX_SIZE tx_size, int eob) {
1795
11.4k
  int eobx, eoby;
1796
11.4k
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
1797
11.4k
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
1798
11.4k
  const int txw_idx = get_txw_idx(tx_size);
1799
11.4k
  const int txh_idx = get_txh_idx(tx_size);
1800
11.4k
  const int txfm_size_col = tx_size_wide[tx_size];
1801
11.4k
  const int txfm_size_row = tx_size_high[tx_size];
1802
11.4k
  const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row);
1803
11.4k
  const int input_stride = txfm_size_row_notzero;
1804
11.4k
  const int buf_size_w_div16 = (eobx + 16) >> 4;
1805
11.4k
  const int buf_size_h_div16 = (eoby + 16) >> 4;
1806
11.4k
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
1807
1808
11.4k
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
1809
11.4k
  const transform_1d_avx2 col_txfm =
1810
11.4k
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
1811
1812
11.4k
  assert(col_txfm != NULL);
1813
1814
11.4k
  int ud_flip, lr_flip;
1815
11.4k
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1816
22.8k
  for (int i = 0; i < buf_size_w_div16; i++) {
1817
11.4k
    __m256i buf0[64];
1818
22.8k
    for (int j = 0; j < buf_size_h_div16; j++) {
1819
11.4k
      __m256i *buf0_cur = buf0 + j * 16;
1820
11.4k
      const int32_t *input_cur = input + i * 16 * input_stride + j * 16;
1821
11.4k
      iidentity_row_16xn_avx2(buf0_cur, input_cur, input_stride, shift[0], 16,
1822
11.4k
                              txw_idx, rect_type);
1823
11.4k
      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
1824
11.4k
    }
1825
11.4k
    col_txfm(buf0, buf0);
1826
11.4k
    __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
1827
11.4k
    int k = ud_flip ? (txfm_size_row - 1) : 0;
1828
11.4k
    const int step = ud_flip ? -1 : 1;
1829
194k
    for (int j = 0; j < txfm_size_row; ++j, k += step) {
1830
182k
      __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
1831
182k
      write_recon_w16_avx2(res, output + (i << 4) + j * stride);
1832
182k
    }
1833
11.4k
  }
1834
11.4k
}
1835
1836
static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
1837
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
1838
19.9k
    TX_SIZE tx_size, int eob) {
1839
19.9k
  __m256i buf1[64];
1840
19.9k
  int eobx, eoby;
1841
19.9k
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
1842
19.9k
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
1843
19.9k
  const int txw_idx = get_txw_idx(tx_size);
1844
19.9k
  const int txh_idx = get_txh_idx(tx_size);
1845
19.9k
  const int txfm_size_col = tx_size_wide[tx_size];
1846
19.9k
  const int txfm_size_row = tx_size_high[tx_size];
1847
19.9k
  const int buf_size_w_div16 = txfm_size_col >> 4;
1848
19.9k
  const int buf_size_h_div16 = (eoby + 16) >> 4;
1849
19.9k
  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
1850
19.9k
  const int input_stride = AOMMIN(32, txfm_size_row);
1851
19.9k
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
1852
1853
19.9k
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
1854
19.9k
  const transform_1d_avx2 row_txfm =
1855
19.9k
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
1856
1857
19.9k
  assert(row_txfm != NULL);
1858
1859
19.9k
  int ud_flip, lr_flip;
1860
19.9k
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1861
39.8k
  for (int i = 0; i < buf_size_h_div16; i++) {
1862
19.9k
    __m256i buf0[64];
1863
19.9k
    load_buffer_32bit_to_16bit_w16_avx2(input + i * 16, input_stride, buf0,
1864
19.9k
                                        buf_size_nonzero_w);
1865
19.9k
    if (rect_type == 1 || rect_type == -1) {
1866
0
      round_shift_avx2(buf0, buf0, buf_size_nonzero_w);  // rect special code
1867
0
    }
1868
19.9k
    row_txfm(buf0, buf0);
1869
19.9k
    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
1870
19.9k
    __m256i *_buf1 = buf1;
1871
19.9k
    if (lr_flip) {
1872
0
      for (int j = 0; j < buf_size_w_div16; ++j) {
1873
0
        __m256i temp[16];
1874
0
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
1875
0
        transpose_16bit_16x16_avx2(temp,
1876
0
                                   _buf1 + 16 * (buf_size_w_div16 - 1 - j));
1877
0
      }
1878
19.9k
    } else {
1879
39.8k
      for (int j = 0; j < buf_size_w_div16; ++j) {
1880
19.9k
        transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
1881
19.9k
      }
1882
19.9k
    }
1883
39.8k
    for (int j = 0; j < buf_size_w_div16; ++j) {
1884
19.9k
      iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
1885
19.9k
                              buf1 + j * 16, shift[1], 16, txh_idx);
1886
19.9k
    }
1887
19.9k
  }
1888
19.9k
}
1889
1890
static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] = {
1891
  { av1_idct8_low1_ssse3, av1_idct8_sse2 },
1892
  { av1_iadst8_low1_ssse3, av1_iadst8_sse2 }
1893
};
1894
1895
static INLINE void load_buffer_avx2(const int32_t *in, int stride,
1896
956k
                                    __m128i *out) {
1897
956k
  const __m256i a = _mm256_load_si256((const __m256i *)in);
1898
956k
  const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1));
1899
956k
  const __m256i c = _mm256_load_si256((const __m256i *)(in + stride * 2));
1900
956k
  const __m256i d = _mm256_load_si256((const __m256i *)(in + stride * 3));
1901
956k
  const __m256i e = _mm256_load_si256((const __m256i *)(in + stride * 4));
1902
956k
  const __m256i f = _mm256_load_si256((const __m256i *)(in + stride * 5));
1903
956k
  const __m256i g = _mm256_load_si256((const __m256i *)(in + stride * 6));
1904
956k
  const __m256i h = _mm256_load_si256((const __m256i *)(in + stride * 7));
1905
1906
  // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
1907
956k
  const __m256i ab_16bit = _mm256_packs_epi32(a, b);
1908
  // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7
1909
956k
  const __m256i cd_16bit = _mm256_packs_epi32(c, d);
1910
  // e0 e1 e2 e3 f0 f1 f2 f3 e4 e5 e6 e7 f4 f5 f6 f7
1911
956k
  const __m256i ef_16bit = _mm256_packs_epi32(e, f);
1912
  // g0 g1 g2 g3 h0 h1 h2 h3 g4 g5 g6 g7 h4 h5 h6 h7
1913
956k
  const __m256i gh_16bit = _mm256_packs_epi32(g, h);
1914
1915
  // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
1916
956k
  const __m256i ab = _mm256_permute4x64_epi64(ab_16bit, 0xd8);
1917
  // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7
1918
956k
  const __m256i cd = _mm256_permute4x64_epi64(cd_16bit, 0xd8);
1919
  // e0 e1 e2 e3 e4 e5 e6 e7 f0 f1 f2 f3 f4 f5 f6 f7
1920
956k
  const __m256i ef = _mm256_permute4x64_epi64(ef_16bit, 0xd8);
1921
  // g0 g1 g2 g3 g4 g5 g6 g7 h0 h1 h2 h3 h4 h5 h6 h7
1922
956k
  const __m256i gh = _mm256_permute4x64_epi64(gh_16bit, 0xd8);
1923
1924
956k
  out[0] = _mm256_castsi256_si128(ab);
1925
956k
  out[1] = _mm256_extractf128_si256(ab, 1);
1926
956k
  out[2] = _mm256_castsi256_si128(cd);
1927
956k
  out[3] = _mm256_extractf128_si256(cd, 1);
1928
956k
  out[4] = _mm256_castsi256_si128(ef);
1929
956k
  out[5] = _mm256_extractf128_si256(ef, 1);
1930
956k
  out[6] = _mm256_castsi256_si128(gh);
1931
956k
  out[7] = _mm256_extractf128_si256(gh, 1);
1932
956k
}
1933
1934
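// Note on the lane handling above: _mm256_packs_epi32 packs within each
// 128-bit lane, producing the interleaved a/b half-row order shown in the
// comments, and _mm256_permute4x64_epi64(..., 0xd8) then swaps the middle
// 64-bit quarters to restore whole rows before the 128-bit halves are
// extracted.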
static INLINE void round_and_transpose_avx2(const __m128i *const in,
1935
                                            __m128i *const out, int bit,
1936
956k
                                            int *lr_flip) {
1937
956k
  __m256i buf_temp[4];
1938
956k
  const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
1939
956k
  int j = *lr_flip ? 7 : 0;
1940
956k
  const int step = *lr_flip ? -1 : 1;
1941
1942
  // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
1943
956k
  buf_temp[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
1944
956k
                                        in[j + 4 * step], 1);
1945
956k
  j += step;
1946
  // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
1947
956k
  buf_temp[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
1948
956k
                                        in[j + 4 * step], 1);
1949
956k
  j += step;
1950
  // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
1951
956k
  buf_temp[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
1952
956k
                                        in[j + 4 * step], 1);
1953
956k
  j += step;
1954
  // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
1955
956k
  buf_temp[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
1956
956k
                                        in[j + 4 * step], 1);
1957
1958
  // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
1959
956k
  buf_temp[0] = _mm256_mulhrs_epi16(buf_temp[0], scale);
1960
  // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
1961
956k
  buf_temp[1] = _mm256_mulhrs_epi16(buf_temp[1], scale);
1962
  // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
1963
956k
  buf_temp[2] = _mm256_mulhrs_epi16(buf_temp[2], scale);
1964
  // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
1965
956k
  buf_temp[3] = _mm256_mulhrs_epi16(buf_temp[3], scale);
1966
1967
  // 70 60 71 61 72 62 73 63 | 30 20 31 21 32 22 33 23
1968
956k
  const __m256i unpcklo0 = _mm256_unpacklo_epi16(buf_temp[0], buf_temp[1]);
1969
  // 74 64 75 65 76 66 77 67 | 34 24 35 25 36 26 37 27
1970
956k
  const __m256i unpckhi0 = _mm256_unpackhi_epi16(buf_temp[0], buf_temp[1]);
1971
  // 50 40 51 41 52 42 53 43 | 10 00 11 01 12 02 13 03
1972
956k
  const __m256i unpcklo1 = _mm256_unpacklo_epi16(buf_temp[2], buf_temp[3]);
1973
  // 54 44 55 45 56 46 57 47 | 14 04 15 05 16 06 17 07
1974
956k
  const __m256i unpckhi1 = _mm256_unpackhi_epi16(buf_temp[2], buf_temp[3]);
1975
1976
  // 70 60 50 40 71 61 51 41 | 30 20 10 00 31 21 11 01
1977
956k
  const __m256i unpcklo00 = _mm256_unpacklo_epi32(unpcklo0, unpcklo1);
1978
  // 72 62 52 42 73 63 53 43 | 32 22 12 02 33 23 13 03
1979
956k
  const __m256i unpckhi00 = _mm256_unpackhi_epi32(unpcklo0, unpcklo1);
1980
  // 74 64 54 44 75 65 55 45 | 34 24 14 04 35 25 15 05
1981
956k
  const __m256i unpcklo01 = _mm256_unpacklo_epi32(unpckhi0, unpckhi1);
1982
  // 76 66 56 46 77 67 57 47 | 36 26 16 06 37 27 17 07
1983
956k
  const __m256i unpckhi01 = _mm256_unpackhi_epi32(unpckhi0, unpckhi1);
1984
1985
  // 70 60 50 40 30 20 10 00 | 71 61 51 41 31 21 11 01
1986
956k
  const __m256i reg_00 = _mm256_permute4x64_epi64(unpcklo00, 0xd8);
1987
  // 72 62 52 42 32 22 12 02 | 73 63 53 43 33 23 13 03
1988
956k
  const __m256i reg_01 = _mm256_permute4x64_epi64(unpckhi00, 0xd8);
1989
  // 74 64 54 44 34 24 14 04 | 75 65 55 45 35 25 15 05
1990
956k
  const __m256i reg_10 = _mm256_permute4x64_epi64(unpcklo01, 0xd8);
1991
  // 76 66 56 46 36 26 16 06 | 77 67 57 47 37 27 17 07
1992
956k
  const __m256i reg_11 = _mm256_permute4x64_epi64(unpckhi01, 0xd8);
1993
1994
  // 70 60 50 40 30 20 10 00
1995
956k
  out[0] = _mm256_castsi256_si128(reg_00);
1996
  // 71 61 51 41 31 21 11 01
1997
956k
  out[1] = _mm256_extracti128_si256(reg_00, 1);
1998
  // 72 62 52 42 32 22 12 02
1999
956k
  out[2] = _mm256_castsi256_si128(reg_01);
2000
  // 73 63 53 43 33 23 13 03
2001
956k
  out[3] = _mm256_extracti128_si256(reg_01, 1);
2002
  // 74 64 54 44 34 24 14 04
2003
956k
  out[4] = _mm256_castsi256_si128(reg_10);
2004
  // 75 65 55 45 35 25 15 05
2005
956k
  out[5] = _mm256_extracti128_si256(reg_10, 1);
2006
  // 76 66 56 46 36 26 16 06
2007
956k
  out[6] = _mm256_castsi256_si128(reg_11);
2008
  // 77 67 57 47 37 27 17 07
2009
956k
  out[7] = _mm256_extracti128_si256(reg_11, 1);
2010
956k
}
2011
2012
static INLINE void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit,
2013
                                                       uint8_t *output,
2014
956k
                                                       int stride, int flipud) {
2015
956k
  __m256i in_256[4], v_256[4];
2016
956k
  int j = flipud ? 7 : 0;
2017
956k
  const int step = flipud ? -1 : 1;
2018
956k
  const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
2019
956k
  const __m256i zero = _mm256_setzero_si256();
2020
  // in[0], in[1]
2021
956k
  in_256[0] =
2022
956k
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
2023
956k
  j += 2 * step;
2024
  // in[2], in[3]
2025
956k
  in_256[1] =
2026
956k
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
2027
956k
  j += 2 * step;
2028
  // in[4], in[5]
2029
956k
  in_256[2] =
2030
956k
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
2031
956k
  j += 2 * step;
2032
  // in[6], in[7]
2033
956k
  in_256[3] =
2034
956k
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
2035
2036
  // i00 i01 i02 i03 i04 i05 i06 i07 i10 i11 i12 i13 i14 i15 i16 i17
2037
956k
  in_256[0] = _mm256_mulhrs_epi16(in_256[0], scale);
2038
  // i20 i21 i22 i23 i24 i25 i26 i27 i30 i31 i32 i33 i34 i35 i36 i37
2039
956k
  in_256[1] = _mm256_mulhrs_epi16(in_256[1], scale);
2040
  // i40 i41 i42 i43 i44 i45 i46 i47 i50 i51 i52 i53 i54 i55 i56 i57
2041
956k
  in_256[2] = _mm256_mulhrs_epi16(in_256[2], scale);
2042
  // i60 i61 i62 i63 i64 i65 i66 i67 i70 i71 i72 i73 i74 i75 i76 i77
2043
956k
  in_256[3] = _mm256_mulhrs_epi16(in_256[3], scale);
2044
2045
956k
  const __m128i v0 = _mm_loadl_epi64((__m128i const *)(output));
2046
956k
  const __m128i v1 = _mm_loadl_epi64((__m128i const *)(output + stride));
2047
956k
  const __m128i v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
2048
956k
  const __m128i v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
2049
956k
  const __m128i v4 = _mm_loadl_epi64((__m128i const *)(output + 4 * stride));
2050
956k
  const __m128i v5 = _mm_loadl_epi64((__m128i const *)(output + 5 * stride));
2051
956k
  const __m128i v6 = _mm_loadl_epi64((__m128i const *)(output + 6 * stride));
2052
956k
  const __m128i v7 = _mm_loadl_epi64((__m128i const *)(output + 7 * stride));
2053
2054
956k
  v_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(v0), v1, 1);
2055
956k
  v_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(v2), v3, 1);
2056
956k
  v_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(v4), v5, 1);
2057
956k
  v_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(v6), v7, 1);
2058
2059
956k
  const __m256i unpcklo0 = _mm256_unpacklo_epi8(v_256[0], zero);
2060
956k
  const __m256i unpcklo1 = _mm256_unpacklo_epi8(v_256[1], zero);
2061
956k
  const __m256i unpcklo2 = _mm256_unpacklo_epi8(v_256[2], zero);
2062
956k
  const __m256i unpcklo3 = _mm256_unpacklo_epi8(v_256[3], zero);
2063
  // 00 01 10 11
2064
956k
  const __m256i x0 = _mm256_adds_epi16(in_256[0], unpcklo0);
2065
  // 20 21 30 31
2066
956k
  const __m256i x1 = _mm256_adds_epi16(in_256[1], unpcklo1);
2067
  // 40 41 50 51
2068
956k
  const __m256i x2 = _mm256_adds_epi16(in_256[2], unpcklo2);
2069
  // 60 61 70 71
2070
956k
  const __m256i x3 = _mm256_adds_epi16(in_256[3], unpcklo3);
2071
2072
  // 00 01 20 21 10 11 30 31
2073
956k
  const __m256i res_0123 = _mm256_packus_epi16(x0, x1);
2074
  // 40 41 60 61 50 51 70 71
2075
956k
  const __m256i res_4567 = _mm256_packus_epi16(x2, x3);
2076
2077
  // 00 01 20 21
2078
956k
  const __m128i res_02 = _mm256_castsi256_si128(res_0123);
2079
  // 10 11 30 31
2080
956k
  const __m128i res_13 = _mm256_extracti128_si256(res_0123, 1);
2081
  // 40 41 60 61
2082
956k
  const __m128i res_46 = _mm256_castsi256_si128(res_4567);
2083
  // 50 51 70 71
2084
956k
  const __m128i res_57 = _mm256_extracti128_si256(res_4567, 1);
2085
2086
  // 00 01
2087
956k
  _mm_storel_epi64((__m128i *)(output), res_02);
2088
  // 10 11
2089
956k
  _mm_storel_epi64((__m128i *)(output + stride), res_13);
2090
  // 20 21
2091
956k
  _mm_storel_epi64((__m128i *)(output + 2 * stride),
2092
956k
                   _mm_unpackhi_epi64(res_02, res_02));
2093
  // 30 31
2094
956k
  _mm_storel_epi64((__m128i *)(output + 3 * stride),
2095
956k
                   _mm_unpackhi_epi64(res_13, res_13));
2096
  // 40 41
2097
956k
  _mm_storel_epi64((__m128i *)(output + 4 * stride), res_46);
2098
  // 50 51
2099
956k
  _mm_storel_epi64((__m128i *)(output + 5 * stride), res_57);
2100
  // 60 61
2101
956k
  _mm_storel_epi64((__m128i *)(output + 6 * stride),
2102
956k
                   _mm_unpackhi_epi64(res_46, res_46));
2103
  // 70 71
2104
956k
  _mm_storel_epi64((__m128i *)(output + 7 * stride),
2105
956k
                   _mm_unpackhi_epi64(res_57, res_57));
2106
956k
}
2107
2108
// The AVX2 implementation has an advantage when multiple operations are
2109
// combined.
2110
static INLINE void lowbd_inv_txfm2d_8x8_no_identity_avx2(
2111
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
2112
956k
    TX_SIZE tx_size, int eob) {
2113
956k
  __m128i buf1[8];
2114
956k
  const int input_stride = 8;
2115
956k
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
2116
956k
  assert(hitx_1d_tab[tx_type] < 2);
2117
956k
  assert(vitx_1d_tab[tx_type] < 2);
2118
956k
  const transform_1d_ssse3 row_txfm =
2119
956k
      lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][eob != 1];
2120
956k
  const transform_1d_ssse3 col_txfm =
2121
956k
      lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][eob != 1];
2122
2123
956k
  assert(col_txfm != NULL);
2124
956k
  assert(row_txfm != NULL);
2125
956k
  int ud_flip, lr_flip;
2126
956k
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2127
2128
956k
  __m128i buf0[8];
2129
956k
  __m128i *buf0_cur = buf0;
2130
956k
  load_buffer_avx2(input, input_stride, buf0_cur);
2131
956k
  row_txfm(buf0, buf0);
2132
2133
956k
  assert(shift[0] < 0);
2134
956k
  __m128i *_buf1 = buf1;
2135
956k
  round_and_transpose_avx2(buf0, _buf1, shift[0], &lr_flip);
2136
956k
  assert(shift[1] < 0);
2137
956k
  col_txfm(buf1, buf1);
2138
956k
  round_shift_lowbd_write_buffer_avx2(buf1, shift[1], output, stride, ud_flip);
2139
956k
}
2140
2141
// AVX2 implementation of the 8x8 inverse transform. Coding AVX2 for tx_types
2142
// with identity in either direction was observed to have no advantage.
2143
static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output,
2144
                                          int stride, TX_TYPE tx_type,
2145
1.18M
                                          TX_SIZE tx_size, int eob) {
2146
1.18M
  switch (tx_type) {
2147
118k
    case IDTX:
2148
118k
      av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
2149
2150
118k
      break;
2151
22.8k
    case V_DCT:
2152
32.8k
    case V_ADST:
2153
39.2k
    case V_FLIPADST:
2154
39.2k
      av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
2155
39.2k
                                                tx_size, eob);
2156
39.2k
      break;
2157
69.2k
    case H_DCT:
2158
72.0k
    case H_ADST:
2159
75.8k
    case H_FLIPADST:
2160
75.8k
      av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
2161
75.8k
                                                tx_size, eob);
2162
75.8k
      break;
2163
956k
    default:
2164
956k
      lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type,
2165
956k
                                            tx_size, eob);
2166
1.18M
  }
2167
1.18M
}
2168
2169
// For 16x16, 32x32, 32x64, 64x32, 64x64, 16x32, 32x16, 64x16, and 16x64.
2170
static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
2171
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
2172
1.81M
    TX_SIZE tx_size, int eob) {
2173
1.81M
  (void)eob;
2174
1.81M
  switch (tx_type) {
2175
1.29M
    case DCT_DCT:
2176
1.40M
    case ADST_DCT:   // ADST in vertical, DCT in horizontal
2177
1.59M
    case DCT_ADST:   // DCT  in vertical, ADST in horizontal
2178
1.69M
    case ADST_ADST:  // ADST in both directions
2179
1.70M
    case FLIPADST_DCT:
2180
1.71M
    case DCT_FLIPADST:
2181
1.72M
    case FLIPADST_FLIPADST:
2182
1.73M
    case ADST_FLIPADST:
2183
1.74M
    case FLIPADST_ADST:
2184
1.74M
      lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
2185
1.74M
                                            tx_size, eob);
2186
1.74M
      break;
2187
33.6k
    case IDTX:
2188
33.6k
      lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
2189
33.6k
      break;
2190
11.4k
    case V_DCT:
2191
11.4k
    case V_ADST:
2192
11.4k
    case V_FLIPADST:
2193
11.4k
      lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
2194
11.4k
                                           tx_size, eob);
2195
11.4k
      break;
2196
19.9k
    case H_DCT:
2197
19.9k
    case H_ADST:
2198
19.9k
    case H_FLIPADST:
2199
19.9k
      lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
2200
19.9k
                                           tx_size, eob);
2201
19.9k
      break;
2202
0
    default:
2203
0
      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
2204
0
                                     eob);
2205
0
      break;
2206
1.81M
  }
2207
1.81M
}
2208
2209
void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
2210
                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
2211
7.47M
                                   int eob) {
2212
7.47M
  switch (tx_size) {
2213
1.00M
    case TX_4X4:
2214
1.51M
    case TX_4X8:
2215
2.24M
    case TX_8X4:
2216
2.54M
    case TX_8X16:
2217
3.10M
    case TX_16X8:
2218
3.35M
    case TX_4X16:
2219
3.97M
    case TX_16X4:
2220
4.07M
    case TX_8X32:
2221
4.47M
    case TX_32X8:
2222
4.47M
      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
2223
4.47M
                                     eob);
2224
4.47M
      break;
2225
1.18M
    case TX_8X8:
2226
1.18M
      lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size,
2227
1.18M
                                    eob);
2228
1.18M
      break;
2229
864k
    case TX_16X16:
2230
1.27M
    case TX_32X32:
2231
1.35M
    case TX_64X64:
2232
1.45M
    case TX_16X32:
2233
1.60M
    case TX_32X16:
2234
1.62M
    case TX_32X64:
2235
1.65M
    case TX_64X32:
2236
1.66M
    case TX_16X64:
2237
1.81M
    case TX_64X16:
2238
1.81M
    default:
2239
1.81M
      lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
2240
1.81M
                                         tx_size, eob);
2241
1.81M
      break;
2242
7.47M
  }
2243
7.47M
}
2244
2245
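// Note on the dispatch above: every block with a 4- or 8-pixel dimension is
// routed to the SSSE3 path (only 8x8 has a dedicated AVX2 kernel), while
// blocks with both dimensions >= 16 go through
// lowbd_inv_txfm2d_add_universe_avx2, matching the 16-pixel-wide AVX2 1D
// kernels.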
void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
2246
9.02M
                           const TxfmParam *txfm_param) {
2247
9.02M
  const TX_TYPE tx_type = txfm_param->tx_type;
2248
9.02M
  if (!txfm_param->lossless) {
2249
7.47M
    av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
2250
7.47M
                                  txfm_param->tx_size, txfm_param->eob);
2251
7.47M
  } else {
2252
1.54M
    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
2253
1.54M
  }
2254
9.02M
}