Coverage Report

Created: 2026-06-15 06:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libvpx/vpx_dsp/x86/sad_avx2.c
Line
Count
Source
1
/*
2
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
#include <immintrin.h>
11
#include "./vpx_dsp_rtcd.h"
12
#include "vpx_ports/mem.h"
13
14
// Sum of absolute differences (SAD) between two blocks that are 64 bytes wide
// and h rows tall, computed with 256-bit AVX2 operations.
//
// src_ptr / ref_ptr point at the first row of each block; src_stride /
// ref_stride are added to the respective pointer to step to the next row.
// Each row is read with unaligned 32-byte loads at offsets 0 and 32.
//
// Returns the low 32 bits of the reduced accumulator (read into an int and
// returned as unsigned int).
static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
15
                                        const uint8_t *ref_ptr, int ref_stride,
16
5.44M
                                        int h) {
17
5.44M
  int i, res;
18
5.44M
  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
19
5.44M
  __m256i sum_sad = _mm256_setzero_si256();
20
5.44M
  __m256i sum_sad_h;
21
5.44M
  __m128i sum_sad128;
22
195M
  for (i = 0; i < h; i++) {
23
190M
    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
24
190M
    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
25
190M
    sad1_reg =
26
190M
        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
27
190M
    sad2_reg = _mm256_sad_epu8(
28
190M
        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
29
190M
    // _mm256_sad_epu8 leaves one small partial sum in each 64-bit lane, so the
    // running total is kept with 32-bit lane adds.
    // NOTE(review): this relies on each lane total staying below 2^32, which
    // holds for the heights instantiated in this file (at most 64 rows).
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
30
190M
    ref_ptr += ref_stride;
31
190M
    src_ptr += src_stride;
32
190M
  }
33
5.44M
  // Horizontal reduction: shift each 128-bit lane right by 8 bytes and add, so
  // the low 64 bits of each lane hold that lane's total; then add the upper
  // 128-bit lane onto the lower one and read out the low 32 bits.
  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
34
5.44M
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
35
5.44M
  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
36
5.44M
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
37
5.44M
  res = _mm_cvtsi128_si32(sum_sad128);
38
5.44M
  return res;
39
5.44M
}
40
41
// Sum of absolute differences (SAD) between two blocks that are 32 bytes wide
// and h rows tall, computed with 256-bit AVX2 operations.
//
// src_ptr / ref_ptr point at the first row of each block; src_stride /
// ref_stride are the offsets between consecutive rows. One unaligned 32-byte
// load covers a whole row, and the loop consumes two rows per iteration.
//
// Returns the low 32 bits of the reduced accumulator (read into an int and
// returned as unsigned int).
static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
42
                                        const uint8_t *ref_ptr, int ref_stride,
43
17.2M
                                        int h) {
44
17.2M
  int i, res;
45
17.2M
  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
46
17.2M
  __m256i sum_sad = _mm256_setzero_si256();
47
17.2M
  __m256i sum_sad_h;
48
17.2M
  __m128i sum_sad128;
49
17.2M
  const int ref2_stride = ref_stride << 1;
50
17.2M
  const int src2_stride = src_stride << 1;
51
17.2M
  // Two rows are handled per iteration, so the loop runs h / 2 times; an odd h
  // would leave the final row unvisited (every caller visible in this file
  // passes an even height).
  const int max = h >> 1;
52
208M
  for (i = 0; i < max; i++) {
53
191M
    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
54
191M
    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
55
191M
    sad1_reg =
56
191M
        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
57
191M
    sad2_reg = _mm256_sad_epu8(
58
191M
        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
59
191M
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
60
191M
    ref_ptr += ref2_stride;
61
191M
    src_ptr += src2_stride;
62
191M
  }
63
17.2M
  // Horizontal reduction: fold the upper 64 bits of each 128-bit lane onto the
  // lower 64 bits, add the two 128-bit lanes, then read out the low 32 bits.
  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
64
17.2M
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
65
17.2M
  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
66
17.2M
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
67
17.2M
  res = _mm_cvtsi128_si32(sum_sad128);
68
17.2M
  return res;
69
17.2M
}
70
71
// Generates the externally visible function vpx_sad64x<h>_avx2, which forwards
// its arguments to sad64xh_avx2 with the block height h supplied as a
// constant.
#define FSAD64_H(h)                                                           \
72
  unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
73
2.59M
                                    const uint8_t *ref_ptr, int ref_stride) { \
74
2.59M
    return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
75
2.59M
  }
vpx_sad64x64_avx2
Line
Count
Source
73
1.42M
                                    const uint8_t *ref_ptr, int ref_stride) { \
74
1.42M
    return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
75
1.42M
  }
vpx_sad64x32_avx2
Line
Count
Source
73
1.16M
                                    const uint8_t *ref_ptr, int ref_stride) { \
74
1.16M
    return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
75
1.16M
  }
76
77
// Generates vpx_sad_skip_64x<h>_avx2, a row-skipping SAD for 64-wide blocks.
// It calls sad64xh_avx2 with both strides doubled and h / 2 rows, so only
// every second row (beginning with the first) is visited, and returns twice
// that partial sum.
#define FSADS64_H(h)                                                          \
78
  unsigned int vpx_sad_skip_64x##h##_avx2(                                    \
79
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
80
2.85M
      int ref_stride) {                                                       \
81
2.85M
    return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
82
2.85M
                            h / 2);                                           \
83
2.85M
  }
vpx_sad_skip_64x64_avx2
Line
Count
Source
80
998k
      int ref_stride) {                                                       \
81
998k
    return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
82
998k
                            h / 2);                                           \
83
998k
  }
vpx_sad_skip_64x32_avx2
Line
Count
Source
80
1.85M
      int ref_stride) {                                                       \
81
1.85M
    return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
82
1.85M
                            h / 2);                                           \
83
1.85M
  }
84
85
// Generates the externally visible function vpx_sad32x<h>_avx2, which forwards
// its arguments to sad32xh_avx2 with the block height h supplied as a
// constant.
#define FSAD32_H(h)                                                           \
86
  unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
87
8.41M
                                    const uint8_t *ref_ptr, int ref_stride) { \
88
8.41M
    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
89
8.41M
  }
vpx_sad32x64_avx2
Line
Count
Source
87
1.97M
                                    const uint8_t *ref_ptr, int ref_stride) { \
88
1.97M
    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
89
1.97M
  }
vpx_sad32x32_avx2
Line
Count
Source
87
2.81M
                                    const uint8_t *ref_ptr, int ref_stride) { \
88
2.81M
    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
89
2.81M
  }
vpx_sad32x16_avx2
Line
Count
Source
87
3.62M
                                    const uint8_t *ref_ptr, int ref_stride) { \
88
3.62M
    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
89
3.62M
  }
90
91
// Generates vpx_sad_skip_32x<h>_avx2, a row-skipping SAD for 32-wide blocks.
// It calls sad32xh_avx2 with both strides doubled and h / 2 rows, so only
// every second row (beginning with the first) is visited, and returns twice
// that partial sum.
#define FSADS32_H(h)                                                          \
92
  unsigned int vpx_sad_skip_32x##h##_avx2(                                    \
93
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
94
8.80M
      int ref_stride) {                                                       \
95
8.80M
    return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
96
8.80M
                            h / 2);                                           \
97
8.80M
  }
vpx_sad_skip_32x64_avx2
Line
Count
Source
94
840k
      int ref_stride) {                                                       \
95
840k
    return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
96
840k
                            h / 2);                                           \
97
840k
  }
vpx_sad_skip_32x32_avx2
Line
Count
Source
94
2.22M
      int ref_stride) {                                                       \
95
2.22M
    return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
96
2.22M
                            h / 2);                                           \
97
2.22M
  }
vpx_sad_skip_32x16_avx2
Line
Count
Source
94
5.73M
      int ref_stride) {                                                       \
95
5.73M
    return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
96
5.73M
                            h / 2);                                           \
97
5.73M
  }
98
99
// Select which SAD functions get emitted. With CONFIG_ENCODERS set, FSAD64 and
// FSAD32 expand to every 64-wide and 32-wide size listed below, including the
// row-skipping variants; otherwise only the plain 64x64 and 32x32 functions
// are generated.
#if CONFIG_ENCODERS
100
#define FSAD64  \
101
  FSAD64_H(64)  \
102
  FSAD64_H(32)  \
103
  FSADS64_H(64) \
104
  FSADS64_H(32)
105
106
#define FSAD32  \
107
  FSAD32_H(64)  \
108
  FSAD32_H(32)  \
109
  FSAD32_H(16)  \
110
  FSADS32_H(64) \
111
  FSADS32_H(32) \
112
  FSADS32_H(16)
113
#else  // !CONFIG_ENCODERS
114
#define FSAD64 FSAD64_H(64)
115
#define FSAD32 FSAD32_H(32)
116
#endif  // CONFIG_ENCODERS
117
118
// Expand the lists above into the actual function definitions.
FSAD64
119
FSAD32
120
121
// The generator macros are not used past this point, so undefine them.
#undef FSAD64
122
#undef FSAD32
123
#undef FSAD64_H
124
#undef FSAD32_H
125
#undef FSADS64_H
126
#undef FSADS32_H
127
128
#if CONFIG_ENCODERS
129
// Generates vpx_sad64x<h>_avg_avx2: the SAD between a 64-wide, h-row source
// block and the byte-wise average (_mm256_avg_epu8) of the reference block
// with second_pred.
//
// src_ptr and ref_ptr advance by their strides after each row. second_pred
// advances by a fixed 64 bytes per row, i.e. it is read as a packed buffer
// with 64 bytes per row. The result is reduced in the same way as in
// sad64xh_avx2 and the low 32 bits are returned.
#define FSADAVG64_H(h)                                                        \
130
  unsigned int vpx_sad64x##h##_avg_avx2(                                      \
131
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
132
0
      int ref_stride, const uint8_t *second_pred) {                           \
133
0
    int i;                                                                    \
134
0
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
135
0
    __m256i sum_sad = _mm256_setzero_si256();                                 \
136
0
    __m256i sum_sad_h;                                                        \
137
0
    __m128i sum_sad128;                                                       \
138
0
    for (i = 0; i < h; i++) {                                                 \
139
0
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
140
0
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
141
0
      ref1_reg = _mm256_avg_epu8(                                             \
142
0
          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
143
0
      ref2_reg = _mm256_avg_epu8(                                             \
144
0
          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
145
0
      sad1_reg = _mm256_sad_epu8(                                             \
146
0
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
147
0
      sad2_reg = _mm256_sad_epu8(                                             \
148
0
          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
149
0
      sum_sad =                                                               \
150
0
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
151
0
      ref_ptr += ref_stride;                                                  \
152
0
      src_ptr += src_stride;                                                  \
153
0
      second_pred += 64;                                                      \
154
0
    }                                                                         \
155
0
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
156
0
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
157
0
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
158
0
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
159
0
    return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                       \
160
0
  }
161
162
// Generates vpx_sad32x<h>_avg_avx2: the SAD between a 32-wide, h-row source
// block and the byte-wise average (_mm256_avg_epu8) of the reference block
// with second_pred.
//
// Two rows are processed per iteration (h >> 1 iterations), so src_ptr and
// ref_ptr advance by twice their strides. second_pred is read at offsets 0 and
// 32 for the two rows and then advances by 64, i.e. it is read as a packed
// buffer with 32 bytes per row. The low 32 bits of the reduced sum are
// returned.
#define FSADAVG32_H(h)                                                        \
163
  unsigned int vpx_sad32x##h##_avg_avx2(                                      \
164
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
165
0
      int ref_stride, const uint8_t *second_pred) {                           \
166
0
    int i;                                                                    \
167
0
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
168
0
    __m256i sum_sad = _mm256_setzero_si256();                                 \
169
0
    __m256i sum_sad_h;                                                        \
170
0
    __m128i sum_sad128;                                                       \
171
0
    int ref2_stride = ref_stride << 1;                                        \
172
0
    int src2_stride = src_stride << 1;                                        \
173
0
    int max = h >> 1;                                                         \
174
0
    for (i = 0; i < max; i++) {                                               \
175
0
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
176
0
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
177
0
      ref1_reg = _mm256_avg_epu8(                                             \
178
0
          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
179
0
      ref2_reg = _mm256_avg_epu8(                                             \
180
0
          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
181
0
      sad1_reg = _mm256_sad_epu8(                                             \
182
0
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
183
0
      sad2_reg = _mm256_sad_epu8(                                             \
184
0
          ref2_reg,                                                           \
185
0
          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
186
0
      sum_sad =                                                               \
187
0
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
188
0
      ref_ptr += ref2_stride;                                                 \
189
0
      src_ptr += src2_stride;                                                 \
190
0
      second_pred += 64;                                                      \
191
0
    }                                                                         \
192
0
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
193
0
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
194
0
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
195
0
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
196
0
    return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                       \
197
0
  }
198
199
// Lists of averaging-SAD sizes to generate: 64x64 and 64x32 for the 64-wide
// kernel, and 32x64, 32x32 and 32x16 for the 32-wide kernel.
#define FSADAVG64 \
200
  FSADAVG64_H(64) \
201
  FSADAVG64_H(32)
202
203
#define FSADAVG32 \
204
  FSADAVG32_H(64) \
205
  FSADAVG32_H(32) \
206
  FSADAVG32_H(16)
207
208
0
// Expand the lists above into the actual function definitions.
FSADAVG64
Unexecuted instantiation: vpx_sad64x64_avg_avx2
Unexecuted instantiation: vpx_sad64x32_avg_avx2
209
FSADAVG32
Unexecuted instantiation: vpx_sad32x64_avg_avx2
Unexecuted instantiation: vpx_sad32x32_avg_avx2
Unexecuted instantiation: vpx_sad32x16_avg_avx2
210
211
// The generator macros are not used past this point, so undefine them.
#undef FSADAVG64
212
#undef FSADAVG32
213
#undef FSADAVG64_H
214
#undef FSADAVG32_H
215
216
#endif  // CONFIG_ENCODERS