Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
All execution counts in this report are 0: no line in this file was exercised by the run, and every macro instantiation is reported as unexecuted. The per-line count column and instantiation markers are therefore folded into this note; the reconstructed source follows.
/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
  const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8));
  const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4));
  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1),
                                    _mm256_extractf128_si256(t1, 1));
  return (unsigned int)_mm_cvtsi128_si32(sum);
}
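
calc_final() is a horizontal reduction: it folds the eight 32-bit partial sums held in a __m256i down to one scalar total. A plain-C sketch of the equivalent computation (calc_final_scalar is a hypothetical name, not part of this file):

#include <stdint.h>
static unsigned int calc_final_scalar(const uint32_t lanes[8]) {
  unsigned int total = 0;
  int i;
  for (i = 0; i < 8; ++i) total += lanes[i];  // add all eight 32-bit lanes
  return total;
}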

static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
    // absolute differences between each ref[] vector and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
    // sum every abs diff
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));

    src += src_stride;
    ref += ref_stride;
  }
}
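
The quantity all of these vector loops accumulate is the ordinary sum of absolute differences. A hypothetical scalar reference (highbd_sad_scalar is not part of this file) showing what the AVX2 paths compute for a width x height block of 16-bit samples:

#include <stdint.h>
#include <stdlib.h> /* abs */
static unsigned int highbd_sad_scalar(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      int width, int height) {
  unsigned int sad = 0;
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) sad += abs((int)src[j] - (int)ref[j]);
    src += src_stride;  // strides are in 16-bit samples, as above
    ref += ref_stride;
  }
  return sad;
}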

static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr,
                                                         int src_stride,
                                                         const uint8_t *ref_ptr,
                                                         int ref_stride,
                                                         int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_32 = _mm256_setzero_si256();
  int i;

  for (i = 0; i < (n / 2); ++i) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2);

    /* sums_16 can overflow after 2 rows, so fold the current sums_16 into
     * sums_32. */
    sums_32 = _mm256_add_epi32(
        sums_32,
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));

    src += src_stride << 1;
    ref += ref_stride << 1;
  }
  return calc_final(sums_32);
}
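
Why the two-row flush cadence is safe: with 12-bit input (the deepest high-bit-depth profile) each absolute difference is at most 4095, and a 64-wide row adds four such terms into every 16-bit lane of sums_16. A hypothetical C11 compile-time check of that bound (not part of this file):

/* 2 rows * 4 terms per lane per row * 4095 = 32760 <= 65535 (UINT16_MAX). */
_Static_assert(2 * 4 * 4095 <= 65535, "64-wide: two rows fit in 16-bit lanes");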

#define HIGHBD_SAD64XN(n)                                                      \
  unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                 \
                                           int ref_stride) {                   \
    return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n);           \
  }

#define HIGHBD_SADSKIP64xN(n)                                                \
  unsigned int vpx_highbd_sad_skip_64x##n##_avx2(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,                \
      int ref_stride) {                                                      \
    return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                   \
  }
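
The _skip_ variants subsample the block: doubling both strides visits only every other row, n / 2 keeps the row count consistent, and the result is doubled to stay on the same scale as a full SAD. In terms of the hypothetical highbd_sad_scalar sketch above, the skip score for a 64xN block is:

/* SAD over even rows only, rescaled to approximate the full-block SAD. */
unsigned int sad_skip = 2 * highbd_sad_scalar(src, 2 * src_stride,
                                              ref, 2 * ref_stride,
                                              64, n / 2);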

static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    // absolute differences between each ref[] vector and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride;
    ref += ref_stride;
  }
}

static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr,
                                                         int src_stride,
                                                         const uint8_t *ref_ptr,
                                                         int ref_stride,
                                                         int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_32 = _mm256_setzero_si256();
  int i;

  for (i = 0; i < (n / 8); ++i) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8);

    /* sums_16 can overflow after 8 rows, so fold the current sums_16 into
     * sums_32. */
    sums_32 = _mm256_add_epi32(
        sums_32,
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));

    src += src_stride << 3;
    ref += ref_stride << 3;
  }
  return calc_final(sums_32);
}
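
Same accounting for the 32-wide path: each row adds two 12-bit terms per 16-bit lane, so flushing every eight rows lands just inside the lane. Hypothetical check, not part of this file:

/* 8 rows * 2 terms per lane per row * 4095 = 65520 <= 65535. */
_Static_assert(8 * 2 * 4095 <= 65535, "32-wide: eight rows fit in 16-bit lanes");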

#define HIGHBD_SAD32XN(n)                                                      \
  unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                 \
                                           int ref_stride) {                   \
    return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n);           \
  }

#define HIGHBD_SADSKIP32xN(n)                                                \
  unsigned int vpx_highbd_sad_skip_32x##n##_avx2(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,                \
      int ref_stride) {                                                      \
    return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                   \
  }

static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; i += 2) {
    // load src and all ref[] (two rows per iteration, one vector per row)
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
    // absolute differences between each ref[] vector and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride << 1;
    ref += ref_stride << 1;
  }
}

static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr,
                                                         int src_stride,
                                                         const uint8_t *ref_ptr,
                                                         int ref_stride,
                                                         int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_32 = _mm256_setzero_si256();
  const int height = VPXMIN(16, n);
  const int num_iters = n / height;
  int i;

  for (i = 0; i < num_iters; ++i) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height);

    // sums_16 can overflow after 16 rows, so fold the current sums_16 into
    // sums_32
    sums_32 = _mm256_add_epi32(
        sums_32,
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));

    src += src_stride << 4;
    ref += ref_stride << 4;
  }
  return calc_final(sums_32);
}
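
The VPXMIN(16, n) chunking covers both block shapes: for n = 8 it runs a single 8-row pass, for n = 32 it runs two 16-row passes. Sixteen rows is the 16-bit limit here, since a 16-wide row adds only one 12-bit term per lane. Hypothetical check, not part of this file:

/* 16 rows * 1 term per lane per row * 4095 = 65520 <= 65535. */
_Static_assert(16 * 1 * 4095 <= 65535, "16-wide: sixteen rows fit in 16-bit lanes");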

#define HIGHBD_SAD16XN(n)                                                      \
  unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                 \
                                           int ref_stride) {                   \
    return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n);           \
  }

#define HIGHBD_SADSKIP16xN(n)                                                \
  unsigned int vpx_highbd_sad_skip_16x##n##_avx2(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,                \
      int ref_stride) {                                                      \
    return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                   \
  }

unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

// clang-format off
HIGHBD_SAD64XN(64)
HIGHBD_SADSKIP64xN(64)
HIGHBD_SAD64XN(32)
HIGHBD_SADSKIP64xN(32)
HIGHBD_SAD32XN(64)
HIGHBD_SADSKIP32xN(64)
HIGHBD_SAD32XN(32)
HIGHBD_SADSKIP32xN(32)
HIGHBD_SAD32XN(16)
HIGHBD_SADSKIP32xN(16)
HIGHBD_SAD16XN(32)
HIGHBD_SADSKIP16xN(32)
HIGHBD_SADSKIP16xN(16)
HIGHBD_SADSKIP16xN(8)
// clang-format on
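
The generated entry points keep libvpx's standard SAD signature, so the 16-bit planes travel through uint8_t pointers. A hypothetical call site (buffer names and strides invented for illustration; CONVERT_TO_BYTEPTR and DECLARE_ALIGNED are the libvpx macros from vpx_ports/mem.h, and the src plane must be 32-byte aligned for the _mm256_load_si256 loads above):

DECLARE_ALIGNED(32, uint16_t, src16[64 * 64]);
DECLARE_ALIGNED(32, uint16_t, ref16[64 * 64]);
unsigned int sad =
    vpx_highbd_sad64x64_avx2(CONVERT_TO_BYTEPTR(src16), 64,
                             CONVERT_TO_BYTEPTR(ref16), 64);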

// AVG -------------------------------------------------------------------------
static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src, all ref[], and the second predictor
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
    const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    const __m256i avg2 = _mm256_avg_epu16(r2, x2);
    const __m256i avg3 = _mm256_avg_epu16(r3, x3);
    // absolute differences between each ref/pred average and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
    // sum every abs diff
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));

    src += src_stride;
    ref += ref_stride;
    sec += 64;
  }
}
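
_mm256_avg_epu16 (PAVGW) computes the rounded unsigned average (a + b + 1) >> 1 in every lane, so the _avg_ variants score the reference averaged with a second predictor rather than the reference alone. Per-lane scalar sketch (avg_round_u16 is a hypothetical name, not part of this file):

#include <stdint.h>
static uint16_t avg_round_u16(uint16_t a, uint16_t b) {
  // rounding average, same per-lane semantics as _mm256_avg_epu16
  return (uint16_t)(((uint32_t)a + b + 1) >> 1);
}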

#define HIGHBD_SAD64XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad64x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 2); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
                                                                              \
      /* sums_16 can overflow after 2 rows, so fold the current sums_16      \
       * into sums_32. */                                                     \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 1;                                                 \
      ref += ref_stride << 1;                                                 \
      sec += 64 << 1;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 64x64
HIGHBD_SAD64XN_AVG(64)

// 64x32
HIGHBD_SAD64XN_AVG(32)

static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src, all ref[], and the second predictor
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    // absolute differences between each ref/pred average and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride;
    ref += ref_stride;
    sec += 32;
  }
}

#define HIGHBD_SAD32XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad32x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 8); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
                                                                              \
      /* sums_16 can overflow after 8 rows, so fold the current sums_16      \
       * into sums_32. */                                                     \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 3;                                                 \
      ref += ref_stride << 3;                                                 \
      sec += 32 << 3;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 32x64
HIGHBD_SAD32XN_AVG(64)

// 32x32
HIGHBD_SAD32XN_AVG(32)

// 32x16
HIGHBD_SAD32XN_AVG(16)

static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; i += 2) {
    // load src, all ref[], and the second predictor (two rows per iteration)
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    // absolute differences between each ref/pred average and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride << 1;
    ref += ref_stride << 1;
    sec += 32;
  }
}

unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_32 = _mm256_setzero_si256();
  int i;

  for (i = 0; i < 2; ++i) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

    // sums_16 can overflow after 16 rows, so fold the current sums_16 into
    // sums_32
    sums_32 = _mm256_add_epi32(
        sums_32,
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));

    src += src_stride << 4;
    ref += ref_stride << 4;
    sec += 16 << 4;
  }
  return calc_final(sums_32);
}

unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}
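
Note that the _avg_ entry points take no stride for second_pred: it is a contiguous width x height prediction block, which is why the loops above advance sec by the block width each row. A hypothetical call mirroring the earlier sketch (buffer names and strides invented for illustration):

DECLARE_ALIGNED(32, uint16_t, src16[16 * 8]);
DECLARE_ALIGNED(32, uint16_t, ref16[16 * 8]);
DECLARE_ALIGNED(32, uint16_t, pred16[16 * 8]);  /* contiguous 16x8 predictor */
unsigned int sad =
    vpx_highbd_sad16x8_avg_avx2(CONVERT_TO_BYTEPTR(src16), 16,
                                CONVERT_TO_BYTEPTR(ref16), 16,
                                CONVERT_TO_BYTEPTR(pred16));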