Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
Source listing; every executable line reports an execution count of 0 (the file is entirely uncovered).
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
#include "vpx_ports/mem.h"

#if CONFIG_VP9_HIGHBITDEPTH
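// One in-place 8-point Hadamard butterfly pass over eight __m256i rows of
// eight 32-bit values each. With iter == 0 the results are also transposed
// so that the second call operates on columns; with iter != 0 the results
// are written back in the final (permuted) row order.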
static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi32(a0, a1);
  __m256i b1 = _mm256_sub_epi32(a0, a1);
  __m256i b2 = _mm256_add_epi32(a2, a3);
  __m256i b3 = _mm256_sub_epi32(a2, a3);
  __m256i b4 = _mm256_add_epi32(a4, a5);
  __m256i b5 = _mm256_sub_epi32(a4, a5);
  __m256i b6 = _mm256_add_epi32(a6, a7);
  __m256i b7 = _mm256_sub_epi32(a6, a7);

  a0 = _mm256_add_epi32(b0, b2);
  a1 = _mm256_add_epi32(b1, b3);
  a2 = _mm256_sub_epi32(b0, b2);
  a3 = _mm256_sub_epi32(b1, b3);
  a4 = _mm256_add_epi32(b4, b6);
  a5 = _mm256_add_epi32(b5, b7);
  a6 = _mm256_sub_epi32(b4, b6);
  a7 = _mm256_sub_epi32(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi32(a0, a4);
    b7 = _mm256_add_epi32(a1, a5);
    b3 = _mm256_add_epi32(a2, a6);
    b4 = _mm256_add_epi32(a3, a7);
    b2 = _mm256_sub_epi32(a0, a4);
    b6 = _mm256_sub_epi32(a1, a5);
    b1 = _mm256_sub_epi32(a2, a6);
    b5 = _mm256_sub_epi32(a3, a7);

    a0 = _mm256_unpacklo_epi32(b0, b1);
    a1 = _mm256_unpacklo_epi32(b2, b3);
    a2 = _mm256_unpackhi_epi32(b0, b1);
    a3 = _mm256_unpackhi_epi32(b2, b3);
    a4 = _mm256_unpacklo_epi32(b4, b5);
    a5 = _mm256_unpacklo_epi32(b6, b7);
    a6 = _mm256_unpackhi_epi32(b4, b5);
    a7 = _mm256_unpackhi_epi32(b6, b7);

    b0 = _mm256_unpacklo_epi64(a0, a1);
    b1 = _mm256_unpacklo_epi64(a4, a5);
    b2 = _mm256_unpackhi_epi64(a0, a1);
    b3 = _mm256_unpackhi_epi64(a4, a5);
    b4 = _mm256_unpacklo_epi64(a2, a3);
    b5 = _mm256_unpacklo_epi64(a6, a7);
    b6 = _mm256_unpackhi_epi64(a2, a3);
    b7 = _mm256_unpackhi_epi64(a6, a7);

    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
  } else {
    in[0] = _mm256_add_epi32(a0, a4);
    in[7] = _mm256_add_epi32(a1, a5);
    in[3] = _mm256_add_epi32(a2, a6);
    in[4] = _mm256_add_epi32(a3, a7);
    in[2] = _mm256_sub_epi32(a0, a4);
    in[6] = _mm256_sub_epi32(a1, a5);
    in[1] = _mm256_sub_epi32(a2, a6);
    in[5] = _mm256_sub_epi32(a3, a7);
  }
}

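// 8x8 Hadamard transform of 16-bit residuals into 32-bit (tran_low_t)
// coefficients: load eight rows, sign-extend to 32 bits, then run the
// butterfly pass twice (rows, then columns).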
void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                  tran_low_t *coeff) {
  __m128i src16[8];
  __m256i src32[8];

  src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
  src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));

  src32[0] = _mm256_cvtepi16_epi32(src16[0]);
  src32[1] = _mm256_cvtepi16_epi32(src16[1]);
  src32[2] = _mm256_cvtepi16_epi32(src16[2]);
  src32[3] = _mm256_cvtepi16_epi32(src16[3]);
  src32[4] = _mm256_cvtepi16_epi32(src16[4]);
  src32[5] = _mm256_cvtepi16_epi32(src16[5]);
  src32[6] = _mm256_cvtepi16_epi32(src16[6]);
  src32[7] = _mm256_cvtepi16_epi32(src16[7]);

  highbd_hadamard_col8_avx2(src32, 0);
  highbd_hadamard_col8_avx2(src32, 1);

  _mm256_storeu_si256((__m256i *)coeff, src32[0]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[1]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[2]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[3]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[4]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[5]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[6]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[7]);
}

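// 16x16 Hadamard: four 8x8 transforms on the quadrants, then one more
// butterfly stage across the four 8x8 outputs with a >> 1 normalization.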
void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 1);
    b1 = _mm256_srai_epi32(b1, 1);
    b2 = _mm256_srai_epi32(b2, 1);
    b3 = _mm256_srai_epi32(b3, 1);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}

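// 32x32 Hadamard: four 16x16 transforms on the quadrants, combined with one
// more butterfly stage and a >> 2 normalization.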
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 2);
    b1 = _mm256_srai_epi32(b1, 2);
    b2 = _mm256_srai_epi32(b2, 2);
    b3 = _mm256_srai_epi32(b3, 2);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

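// Split a vector of 16-bit values into 32-bit sign-extended halves: the sign
// mask comes from comparing against zero, so interleaving each element with
// its sign word is equivalent to sign extension (per 128-bit lane).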
static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
                                                   __m256i *out_lo,
                                                   __m256i *out_hi) {
  const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in);
  *out_lo = _mm256_unpacklo_epi16(in, sign_bits);
  *out_hi = _mm256_unpackhi_epi16(in, sign_bits);
}

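// 16-bit counterpart of highbd_hadamard_col8_avx2: one 8-point butterfly pass
// over eight rows, processing two 8x8 blocks at once (one per 128-bit lane).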
static void hadamard_col8x2_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi16(a0, a1);
  __m256i b1 = _mm256_sub_epi16(a0, a1);
  __m256i b2 = _mm256_add_epi16(a2, a3);
  __m256i b3 = _mm256_sub_epi16(a2, a3);
  __m256i b4 = _mm256_add_epi16(a4, a5);
  __m256i b5 = _mm256_sub_epi16(a4, a5);
  __m256i b6 = _mm256_add_epi16(a6, a7);
  __m256i b7 = _mm256_sub_epi16(a6, a7);

  a0 = _mm256_add_epi16(b0, b2);
  a1 = _mm256_add_epi16(b1, b3);
  a2 = _mm256_sub_epi16(b0, b2);
  a3 = _mm256_sub_epi16(b1, b3);
  a4 = _mm256_add_epi16(b4, b6);
  a5 = _mm256_add_epi16(b5, b7);
  a6 = _mm256_sub_epi16(b4, b6);
  a7 = _mm256_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi16(a0, a4);
    b7 = _mm256_add_epi16(a1, a5);
    b3 = _mm256_add_epi16(a2, a6);
    b4 = _mm256_add_epi16(a3, a7);
    b2 = _mm256_sub_epi16(a0, a4);
    b6 = _mm256_sub_epi16(a1, a5);
    b1 = _mm256_sub_epi16(a2, a6);
    b5 = _mm256_sub_epi16(a3, a7);

    a0 = _mm256_unpacklo_epi16(b0, b1);
    a1 = _mm256_unpacklo_epi16(b2, b3);
    a2 = _mm256_unpackhi_epi16(b0, b1);
    a3 = _mm256_unpackhi_epi16(b2, b3);
    a4 = _mm256_unpacklo_epi16(b4, b5);
    a5 = _mm256_unpacklo_epi16(b6, b7);
    a6 = _mm256_unpackhi_epi16(b4, b5);
    a7 = _mm256_unpackhi_epi16(b6, b7);

    b0 = _mm256_unpacklo_epi32(a0, a1);
    b1 = _mm256_unpacklo_epi32(a4, a5);
    b2 = _mm256_unpackhi_epi32(a0, a1);
    b3 = _mm256_unpackhi_epi32(a4, a5);
    b4 = _mm256_unpacklo_epi32(a2, a3);
    b5 = _mm256_unpacklo_epi32(a6, a7);
    b6 = _mm256_unpackhi_epi32(a2, a3);
    b7 = _mm256_unpackhi_epi32(a6, a7);

    in[0] = _mm256_unpacklo_epi64(b0, b1);
    in[1] = _mm256_unpackhi_epi64(b0, b1);
    in[2] = _mm256_unpacklo_epi64(b2, b3);
    in[3] = _mm256_unpackhi_epi64(b2, b3);
    in[4] = _mm256_unpacklo_epi64(b4, b5);
    in[5] = _mm256_unpackhi_epi64(b4, b5);
    in[6] = _mm256_unpacklo_epi64(b6, b7);
    in[7] = _mm256_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm256_add_epi16(a0, a4);
    in[7] = _mm256_add_epi16(a1, a5);
    in[3] = _mm256_add_epi16(a2, a6);
    in[4] = _mm256_add_epi16(a3, a7);
    in[2] = _mm256_sub_epi16(a0, a4);
    in[6] = _mm256_sub_epi16(a1, a5);
    in[1] = _mm256_sub_epi16(a2, a6);
    in[5] = _mm256_sub_epi16(a3, a7);
  }
}

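// Hadamard transform of two horizontally adjacent 8x8 blocks of 16-bit
// residuals in one pass; the two blocks are de-interleaved from the 128-bit
// lanes and written to coeff one after the other.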
static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  __m256i src[8];
  src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
  src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));

  hadamard_col8x2_avx2(src, 0);
  hadamard_col8x2_avx2(src, 1);

  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x31));
}

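// 16x16 Hadamard for low-bitdepth residuals. is_final selects between writing
// tran_low_t output (store_tran_low) and keeping a 16-bit intermediate that
// the 32x32 transform reuses directly.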
static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 2; ++idx) {
    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
  }

  for (idx = 0; idx < 64; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 1);
    b1 = _mm256_srai_epi16(b1, 1);
    b2 = _mm256_srai_epi16(b2, 1);
    b3 = _mm256_srai_epi16(b3, 1);
    if (is_final) {
      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
      coeff += 16;
    } else {
      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
      _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
      coeff16 += 16;
    }
    t_coeff += 16;
  }
}

void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
}

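// 32x32 Hadamard: four 16x16 transforms kept as 16-bit intermediates, then a
// final butterfly stage performed in 32-bit precision (sign-extend, add/sub,
// >> 2, pack back to 16 bits) so the intermediate sums cannot overflow int16.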
void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage.  Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int idx;
  __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
      b3_lo;
  __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
      b3_hi;
  __m256i b0, b1, b2, b3;
  const __m256i zero = _mm256_setzero_si256();
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_avx2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    // Sign extend 16 bit to 32 bit.
    sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi);
    sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi);
    sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi);
    sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi);

    b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo);
    b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi);

    b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo);
    b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi);

    b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo);
    b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi);

    b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo);
    b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi);

    b0_lo = _mm256_srai_epi32(b0_lo, 2);
    b1_lo = _mm256_srai_epi32(b1_lo, 2);
    b2_lo = _mm256_srai_epi32(b2_lo, 2);
    b3_lo = _mm256_srai_epi32(b3_lo, 2);

    b0_hi = _mm256_srai_epi32(b0_hi, 2);
    b1_hi = _mm256_srai_epi32(b1_hi, 2);
    b2_hi = _mm256_srai_epi32(b2_hi, 2);
    b3_hi = _mm256_srai_epi32(b3_hi, 2);

    b0 = _mm256_packs_epi32(b0_lo, b0_hi);
    b1 = _mm256_packs_epi32(b1_lo, b1_hi);
    b2 = _mm256_packs_epi32(b2_lo, b2_hi);
    b3 = _mm256_packs_epi32(b3_lo, b3_hi);

    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);

    coeff += 16;
    t_coeff += 16;
  }
}

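// SATD: sum of absolute values of the coefficients. Sixteen 16-bit
// coefficients are processed per iteration: abs + madd against a vector of
// ones accumulates into 32-bit lanes, which are then reduced horizontally.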
int vpx_satd_avx2(const tran_low_t *coeff, int length) {
  const __m256i one = _mm256_set1_epi16(1);
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 16) {
    const __m256i src_line = load_tran_low(coeff);
    const __m256i abs = _mm256_abs_epi16(src_line);
    const __m256i sum = _mm256_madd_epi16(abs, one);
    accum = _mm256_add_epi32(accum, sum);
    coeff += 16;
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}

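// High-bitdepth SATD: coefficients are already 32-bit, so eight are summed
// per iteration with a plain 32-bit abs/add before the horizontal reduction.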
#if CONFIG_VP9_HIGHBITDEPTH
int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 8, coeff += 8) {
    const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
    const __m256i abs = _mm256_abs_epi32(src_line);
    accum = _mm256_add_epi32(accum, abs);
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH