Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
#include <immintrin.h>  // AVX2
11
#include "./vpx_dsp_rtcd.h"
12
#include "vpx/vpx_integer.h"
13
14
// Horizontally reduce four 8x32-bit partial-sum vectors (one per candidate
// ref) into four scalar SAD totals and store them to sad_array[0..3].
static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
                                          uint32_t sad_array[4]) {
  // hadd adds adjacent pairs within each 128-bit half: t0 interleaves the
  // partial sums of sums[0]/sums[1], t1 those of sums[2]/sums[3], and t2
  // reduces both so each 128-bit half holds one 32-bit partial per ref.
  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
  // Fold the high 128-bit half onto the low half: 32-bit lane x of `sum`
  // now holds the complete SAD for ref x.
  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
                                    _mm256_extractf128_si256(t2, 1));
  _mm_storeu_si128((__m128i *)sad_array, sum);
}
23
24
// Accumulate 16-bit absolute pixel differences for a 64-wide high-bitdepth
// block over `height` rows, against all four candidate refs at once.
// sums_16[x] gathers per-lane running sums for refs[x]; the 16-bit lanes can
// wrap if too many rows are accumulated, so callers keep `height` small and
// widen to 32 bits between passes (see highbd_sad64xNx4d_avx2, which uses
// height == 2).
// NOTE: the refs[] pointers are advanced in place by height * ref_stride;
// src is passed by value and the caller must advance it itself.
static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
                                               const uint16_t *src,
                                               int src_stride,
                                               uint16_t *refs[4],
                                               int ref_stride, int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[] — 64 pixels = 4 x 16 uint16 lanes.
    // src is aligned (load); refs may not be (loadu).
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    int x;

    for (x = 0; x < 4; ++x) {
      __m256i r[4];
      r[0] = _mm256_loadu_si256((const __m256i *)refs[x]);
      r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16));
      r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32));
      r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48));

      // absolute differences between every ref[] to src
      r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0));
      r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1));
      r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2));
      r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3));

      // sum every abs diff
      sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1]));
      sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3]));
    }

    src += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }
}
63
64
// Compute four SADs of a 64x`n` high-bitdepth block against ref_array[0..3],
// writing the results to sad_array[0..3].
// Rows are consumed two at a time: per the original comment, the 16-bit
// accumulators would go out of range after 2 rows of 64 pixels, so each
// 2-row pass is widened to 32 bits before the next one.
static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2(
    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
    int ref_stride, uint32_t sad_array[4], int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];
  __m256i sums_32[4];
  int i, j;

  for (j = 0; j < 4; ++j) {
    refs[j] = CONVERT_TO_SHORTPTR(ref_array[j]);
    sums_32[j] = _mm256_setzero_si256();
  }

  for (i = 0; i < (n / 2); ++i) {
    for (j = 0; j < 4; ++j) {
      sums_16[j] = _mm256_setzero_si256();
    }

    // refs[] pointers are advanced in place by the helper.
    highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2);

    // Zero-extend each uint16 lane to 32 bits (low and high 128-bit halves
    // separately) and fold into the running 32-bit totals.
    for (j = 0; j < 4; ++j) {
      sums_32[j] = _mm256_add_epi32(
          sums_32[j],
          _mm256_add_epi32(
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[j])),
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[j], 1))));
    }

    src += src_stride << 1;  // advance past the 2 rows just processed
  }
  calc_final_4(sums_32, sad_array);
}
117
118
// Emits vpx_highbd_sad64x<n>x4d_avx2(): 4-way SAD of a 64x<n> high-bitdepth
// block against four reference blocks.
#define HIGHBD_SAD64XNX4D(n)                                                   \
  void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride,      \
                                      const uint8_t *const ref_array[4],       \
                                      int ref_stride, uint32_t sad_array[4]) { \
    highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array,  \
                           n);                                                 \
  }
Unexecuted instantiation: vpx_highbd_sad64x64x4d_avx2
Unexecuted instantiation: vpx_highbd_sad64x32x4d_avx2
125
126
// Emits vpx_highbd_sad_skip_64x<n>x4d_avx2(): approximates the 64x<n> SADs by
// sampling every other row (both strides doubled, half the rows) and then
// doubling the results.
#define HIGHBD_SADSKIP64XNx4D(n)                                             \
  void vpx_highbd_sad_skip_64x##n##x4d_avx2(                                 \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) {                               \
    highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,   \
                           sad_array, n / 2);                                \
    sad_array[0] <<= 1;                                                      \
    sad_array[1] <<= 1;                                                      \
    sad_array[2] <<= 1;                                                      \
    sad_array[3] <<= 1;                                                      \
  }
Unexecuted instantiation: vpx_highbd_sad_skip_64x64x4d_avx2
Unexecuted instantiation: vpx_highbd_sad_skip_64x32x4d_avx2
137
138
// Accumulate 16-bit absolute pixel differences for a 32-wide high-bitdepth
// block over `height` rows, against all four candidate refs at once.
// sums_16[x] gathers per-lane running sums for refs[x]; the 16-bit lanes can
// wrap if too many rows are accumulated (the caller highbd_sad32xNx4d_avx2
// limits each pass to 8 rows and widens to 32 bits between passes).
// NOTE: the refs[] pointers are advanced in place by height * ref_stride.
static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
                                               const uint16_t *src,
                                               int src_stride,
                                               uint16_t *refs[4],
                                               int ref_stride, int height) {
  int i;
  for (i = 0; i < height; i++) {
    __m256i r[8];

    // load src and all ref[] — 32 pixels = 2 x 16 uint16 lanes per row.
    // src is aligned (load); refs may not be (loadu).
    const __m256i s = _mm256_load_si256((const __m256i *)src);
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16));
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16));
    r[2] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16));
    r[4] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16));
    r[6] = _mm256_loadu_si256((const __m256i *)refs[3]);
    r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16));

    // absolute differences between every ref[] to src
    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2));
    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2));
    r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s));
    r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2));
    r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s));
    r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2));

    // sum every abs diff
    sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1]));
    sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3]));
    sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5]));
    sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7]));

    src += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }
}
182
183
// Compute four SADs of a 32x`n` high-bitdepth block against ref_array[0..3],
// writing the results to sad_array[0..3].
// Rows are consumed eight at a time: per the original comment, the 16-bit
// accumulators would go out of range after 8 rows of 32 pixels, so each
// 8-row pass is widened to 32 bits before the next one.
static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2(
    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
    int ref_stride, uint32_t sad_array[4], int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];
  __m256i sums_32[4];
  int i, j;

  for (j = 0; j < 4; ++j) {
    refs[j] = CONVERT_TO_SHORTPTR(ref_array[j]);
    sums_32[j] = _mm256_setzero_si256();
  }

  for (i = 0; i < (n / 8); ++i) {
    for (j = 0; j < 4; ++j) {
      sums_16[j] = _mm256_setzero_si256();
    }

    // refs[] pointers are advanced in place by the helper.
    highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);

    // Zero-extend each uint16 lane to 32 bits (low and high 128-bit halves
    // separately) and fold into the running 32-bit totals.
    for (j = 0; j < 4; ++j) {
      sums_32[j] = _mm256_add_epi32(
          sums_32[j],
          _mm256_add_epi32(
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[j])),
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[j], 1))));
    }

    src += src_stride << 3;  // advance past the 8 rows just processed
  }
  calc_final_4(sums_32, sad_array);
}
236
237
// Emits vpx_highbd_sad32x<n>x4d_avx2(): 4-way SAD of a 32x<n> high-bitdepth
// block against four reference blocks.
#define HIGHBD_SAD32XNX4D(n)                                                   \
  void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride,      \
                                      const uint8_t *const ref_array[4],       \
                                      int ref_stride, uint32_t sad_array[4]) { \
    highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array,  \
                           n);                                                 \
  }
Unexecuted instantiation: vpx_highbd_sad32x64x4d_avx2
Unexecuted instantiation: vpx_highbd_sad32x32x4d_avx2
Unexecuted instantiation: vpx_highbd_sad32x16x4d_avx2
244
245
// Emits vpx_highbd_sad_skip_32x<n>x4d_avx2(): approximates the 32x<n> SADs by
// sampling every other row (both strides doubled, half the rows) and then
// doubling the results.
#define HIGHBD_SADSKIP32XNx4D(n)                                             \
  void vpx_highbd_sad_skip_32x##n##x4d_avx2(                                 \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) {                               \
    highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,   \
                           sad_array, n / 2);                                \
    sad_array[0] <<= 1;                                                      \
    sad_array[1] <<= 1;                                                      \
    sad_array[2] <<= 1;                                                      \
    sad_array[3] <<= 1;                                                      \
  }
Unexecuted instantiation: vpx_highbd_sad_skip_32x64x4d_avx2
Unexecuted instantiation: vpx_highbd_sad_skip_32x32x4d_avx2
Unexecuted instantiation: vpx_highbd_sad_skip_32x16x4d_avx2
256
257
// Accumulate 16-bit absolute pixel differences for a 16-wide high-bitdepth
// block over `height` rows, against all four candidate refs at once.
// sums_16[x] gathers per-lane running sums for refs[x]; the 16-bit lanes can
// wrap if too many rows are accumulated (callers limit each pass to at most
// 16 rows and widen to 32 bits between passes).
// NOTE: the refs[] pointers are advanced in place by height * ref_stride.
static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
                                               const uint16_t *src,
                                               int src_stride,
                                               uint16_t *refs[4],
                                               int ref_stride, int height) {
  int i;
  for (i = 0; i < height; i++) {
    __m256i r[4];

    // load src and all ref[] — one 16-lane uint16 vector covers a full row.
    // src is aligned (load); refs may not be (loadu).
    const __m256i s = _mm256_load_si256((const __m256i *)src);
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);

    // absolute differences between every ref[] to src
    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));

    // sum every abs diff
    sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
    sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
    sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
    sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);

    src += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }
}
292
293
// Compute four SADs of a 16x`n` high-bitdepth block against ref_array[0..3],
// writing the results to sad_array[0..3].
// Rows are consumed at most 16 at a time: per the original comment, the
// 16-bit accumulators would go out of range after 16 rows, so each pass is
// widened to 32 bits before the next one.
static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2(
    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
    int ref_stride, uint32_t sad_array[4], int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];
  __m256i sums_32[4];
  // For n < 16 a single short pass covers the whole block.
  const int height = VPXMIN(16, n);
  const int num_iters = n / height;
  int i, j;

  for (j = 0; j < 4; ++j) {
    refs[j] = CONVERT_TO_SHORTPTR(ref_array[j]);
    sums_32[j] = _mm256_setzero_si256();
  }

  for (i = 0; i < num_iters; ++i) {
    for (j = 0; j < 4; ++j) {
      sums_16[j] = _mm256_setzero_si256();
    }

    // refs[] pointers are advanced in place by the helper.
    highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height);

    // Zero-extend each uint16 lane to 32 bits (low and high 128-bit halves
    // separately) and fold into the running 32-bit totals.
    for (j = 0; j < 4; ++j) {
      sums_32[j] = _mm256_add_epi32(
          sums_32[j],
          _mm256_add_epi32(
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[j])),
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[j], 1))));
    }

    src += src_stride << 4;  // advance past the 16 rows just processed
  }
  calc_final_4(sums_32, sad_array);
}
347
348
// Emits vpx_highbd_sad16x<n>x4d_avx2(): 4-way SAD of a 16x<n> high-bitdepth
// block against four reference blocks.
#define HIGHBD_SAD16XNX4D(n)                                                   \
  void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride,      \
                                      const uint8_t *const ref_array[4],       \
                                      int ref_stride, uint32_t sad_array[4]) { \
    highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array,  \
                           n);                                                 \
  }
355
356
// Emits vpx_highbd_sad_skip_16x<n>x4d_avx2(): approximates the 16x<n> SADs by
// sampling every other row (both strides doubled, half the rows) and then
// doubling the results.
#define HIGHBD_SADSKIP16XNx4D(n)                                             \
  void vpx_highbd_sad_skip_16x##n##x4d_avx2(                                 \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) {                               \
    highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,   \
                           sad_array, n / 2);                                \
    sad_array[0] <<= 1;                                                      \
    sad_array[1] <<= 1;                                                      \
    sad_array[2] <<= 1;                                                      \
    sad_array[3] <<= 1;                                                      \
  }
Unexecuted instantiation: vpx_highbd_sad_skip_16x32x4d_avx2
Unexecuted instantiation: vpx_highbd_sad_skip_16x16x4d_avx2
Unexecuted instantiation: vpx_highbd_sad_skip_16x8x4d_avx2
367
368
// 4-way SAD for a 16x16 high-bitdepth block. Hand-written (rather than via
// HIGHBD_SAD16XNX4D) because all 16 rows fit in one 16-bit accumulation pass,
// so no intermediate 32-bit flush loop is needed.
void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *const ref_array[4],
                                 int ref_stride, uint32_t sad_array[4]) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];
  __m256i sums_32[4];
  int j;

  for (j = 0; j < 4; ++j) {
    refs[j] = CONVERT_TO_SHORTPTR(ref_array[j]);
    sums_16[j] = _mm256_setzero_si256();
  }

  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);

  // Zero-extend each uint16 lane to 32 bits (low and high 128-bit halves
  // separately) before the final reduction.
  for (j = 0; j < 4; ++j) {
    sums_32[j] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[j])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[j], 1)));
  }
  calc_final_4(sums_32, sad_array);
}
403
404
// 4-way SAD for a 16x8 high-bitdepth block. Hand-written (rather than via
// HIGHBD_SAD16XNX4D) because all 8 rows fit in one 16-bit accumulation pass,
// so no intermediate 32-bit flush loop is needed.
void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *const ref_array[4],
                                int ref_stride, uint32_t sad_array[4]) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];
  __m256i sums_32[4];
  int j;

  for (j = 0; j < 4; ++j) {
    refs[j] = CONVERT_TO_SHORTPTR(ref_array[j]);
    sums_16[j] = _mm256_setzero_si256();
  }

  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);

  // Zero-extend each uint16 lane to 32 bits (low and high 128-bit halves
  // separately) before the final reduction.
  for (j = 0; j < 4; ++j) {
    sums_32[j] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[j])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[j], 1)));
  }
  calc_final_4(sums_32, sad_array);
}
439
440
// clang-format off
// Instantiate the exported entry points for every supported block size.
// 16x16 and 16x8 (non-skip) are not generated here because hand-written
// single-pass versions are defined above.
HIGHBD_SAD64XNX4D(64)
HIGHBD_SADSKIP64XNx4D(64)

HIGHBD_SAD64XNX4D(32)
HIGHBD_SADSKIP64XNx4D(32)

HIGHBD_SAD32XNX4D(64)
HIGHBD_SADSKIP32XNx4D(64)

HIGHBD_SAD32XNX4D(32)
HIGHBD_SADSKIP32XNx4D(32)

HIGHBD_SAD32XNX4D(16)
HIGHBD_SADSKIP32XNx4D(16)

HIGHBD_SAD16XNX4D(32)
HIGHBD_SADSKIP16XNx4D(32)

HIGHBD_SADSKIP16XNx4D(16)

HIGHBD_SADSKIP16XNx4D(8)
    // clang-format on