Coverage Report

Created: 2026-02-14 06:59

/src/libvpx/vpx_dsp/x86/sad_avx2.c
Line |  Count | Source
   1 |        | /*
   2 |        |  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
   3 |        |  *
   4 |        |  *  Use of this source code is governed by a BSD-style license
   5 |        |  *  that can be found in the LICENSE file in the root of the source
   6 |        |  *  tree. An additional intellectual property rights grant can be found
   7 |        |  *  in the file PATENTS.  All contributing project authors may
   8 |        |  *  be found in the AUTHORS file in the root of the source tree.
   9 |        |  */
  10 |        | #include <immintrin.h>
  11 |        | #include "./vpx_dsp_rtcd.h"
  12 |        | #include "vpx_ports/mem.h"
  13 |        |
  14 |        | static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
  15 |        |                                         const uint8_t *ref_ptr, int ref_stride,
  16 |  2.08M |                                         int h) {
  17 |  2.08M |   int i, res;
  18 |  2.08M |   __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
  19 |  2.08M |   __m256i sum_sad = _mm256_setzero_si256();
  20 |  2.08M |   __m256i sum_sad_h;
  21 |  2.08M |   __m128i sum_sad128;
  22 |  78.3M |   for (i = 0; i < h; i++) {
  23 |  76.2M |     ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
  24 |  76.2M |     ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
  25 |  76.2M |     sad1_reg =
  26 |  76.2M |         _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
  27 |  76.2M |     sad2_reg = _mm256_sad_epu8(
  28 |  76.2M |         ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
  29 |  76.2M |     sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
  30 |  76.2M |     ref_ptr += ref_stride;
  31 |  76.2M |     src_ptr += src_stride;
  32 |  76.2M |   }
  33 |  2.08M |   sum_sad_h = _mm256_srli_si256(sum_sad, 8);
  34 |  2.08M |   sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  35 |  2.08M |   sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  36 |  2.08M |   sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
  37 |  2.08M |   res = _mm_cvtsi128_si32(sum_sad128);
  38 |  2.08M |   return res;
  39 |  2.08M | }
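Note: the hot loop above (source lines 22-32) loads two 32-byte halves of each 64-pixel row and accumulates packed SADs with _mm256_sad_epu8. As a point of reference, below is a minimal scalar sketch of the quantity it computes; the helper name sad64xh_scalar is hypothetical and not part of sad_avx2.c.

    /*
     * Scalar sketch (hypothetical helper, not part of sad_avx2.c) of what
     * sad64xh_avx2 accumulates: the sum of absolute differences over a
     * 64-pixel-wide block of height h.
     */
    #include <stdint.h>
    #include <stdlib.h>

    static unsigned int sad64xh_scalar(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       int h) {
      unsigned int sad = 0;
      int i, j;
      for (i = 0; i < h; i++) {
        for (j = 0; j < 64; j++) {
          sad += abs(src_ptr[j] - ref_ptr[j]);  /* per-pixel absolute difference */
        }
        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      return sad;
    }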
  40 |        |
  41 |        | static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
  42 |        |                                         const uint8_t *ref_ptr, int ref_stride,
  43 |  11.1M |                                         int h) {
  44 |  11.1M |   int i, res;
  45 |  11.1M |   __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
  46 |  11.1M |   __m256i sum_sad = _mm256_setzero_si256();
  47 |  11.1M |   __m256i sum_sad_h;
  48 |  11.1M |   __m128i sum_sad128;
  49 |  11.1M |   const int ref2_stride = ref_stride << 1;
  50 |  11.1M |   const int src2_stride = src_stride << 1;
  51 |  11.1M |   const int max = h >> 1;
  52 |   145M |   for (i = 0; i < max; i++) {
  53 |   134M |     ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
  54 |   134M |     ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
  55 |   134M |     sad1_reg =
  56 |   134M |         _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
  57 |   134M |     sad2_reg = _mm256_sad_epu8(
  58 |   134M |         ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
  59 |   134M |     sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
  60 |   134M |     ref_ptr += ref2_stride;
  61 |   134M |     src_ptr += src2_stride;
  62 |   134M |   }
  63 |  11.1M |   sum_sad_h = _mm256_srli_si256(sum_sad, 8);
  64 |  11.1M |   sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  65 |  11.1M |   sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  66 |  11.1M |   sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
  67 |  11.1M |   res = _mm_cvtsi128_si32(sum_sad128);
  68 |  11.1M |   return res;
  69 |  11.1M | }
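Note: sad32xh_avx2 handles two 32-byte rows per iteration (max = h >> 1, with both strides doubled), so one 256-bit load covers each row. In both functions, after _mm256_sad_epu8 the accumulator sum_sad holds four 64-bit partial sums, and the srli/extracti128/add tail folds them into one 32-bit result. The fold is equivalent to this hypothetical sketch:

    /*
     * Hypothetical illustration of the reduction tail shared by both
     * functions above: sum_sad carries four 64-bit partial SADs, one per
     * 64-bit lane, and the result is simply their sum.
     */
    #include <immintrin.h>
    #include <stdint.h>

    static unsigned int reduce_sad_sketch(__m256i sum_sad) {
      uint64_t lanes[4];
      _mm256_storeu_si256((__m256i *)lanes, sum_sad);  /* spill the four 64-bit lanes */
      return (unsigned int)(lanes[0] + lanes[1] + lanes[2] + lanes[3]);
    }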
  70 |        |
  71 |        | #define FSAD64_H(h)                                                           \
  72 |        |   unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
  73 |  1.41M |                                     const uint8_t *ref_ptr, int ref_stride) { \
  74 |  1.41M |     return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  75 |  1.41M |   }

  Instantiation vpx_sad64x64_avx2:
  73 |   452k |                                     const uint8_t *ref_ptr, int ref_stride) { \
  74 |   452k |     return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  75 |   452k |   }

  Instantiation vpx_sad64x32_avx2:
  73 |   958k |                                     const uint8_t *ref_ptr, int ref_stride) { \
  74 |   958k |     return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  75 |   958k |   }
  76 |        |
  77 |        | #define FSADS64_H(h)                                                          \
  78 |        |   unsigned int vpx_sad_skip_64x##h##_avx2(                                    \
  79 |        |       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
  80 |   672k |       int ref_stride) {                                                       \
  81 |   672k |     return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  82 |   672k |                             h / 2);                                           \
  83 |   672k |   }

  Instantiation vpx_sad_skip_64x64_avx2:
  80 |   369k |       int ref_stride) {                                                       \
  81 |   369k |     return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  82 |   369k |                             h / 2);                                           \
  83 |   369k |   }

  Instantiation vpx_sad_skip_64x32_avx2:
  80 |   302k |       int ref_stride) {                                                       \
  81 |   302k |     return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  82 |   302k |                             h / 2);                                           \
  83 |   302k |   }
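For readability, FSADS64_H(64) above expands to the following (whitespace normalized). The skip variants sample every other row by doubling both strides and halving h, then double the partial SAD to approximate the full-block SAD.

    unsigned int vpx_sad_skip_64x64_avx2(
        const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
        int ref_stride) {
      /* Every other row (strides doubled, 64 / 2 = 32 rows), result doubled
       * to approximate the SAD of the full 64x64 block. */
      return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2,
                              64 / 2);
    }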
  84 |        |
  85 |        | #define FSAD32_H(h)                                                           \
  86 |        |   unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
  87 |  7.87M |                                     const uint8_t *ref_ptr, int ref_stride) { \
  88 |  7.87M |     return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  89 |  7.87M |   }

  Instantiation vpx_sad32x64_avx2:
  87 |   800k |                                     const uint8_t *ref_ptr, int ref_stride) { \
  88 |   800k |     return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  89 |   800k |   }

  Instantiation vpx_sad32x32_avx2:
  87 |  3.72M |                                     const uint8_t *ref_ptr, int ref_stride) { \
  88 |  3.72M |     return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  89 |  3.72M |   }

  Instantiation vpx_sad32x16_avx2:
  87 |  3.35M |                                     const uint8_t *ref_ptr, int ref_stride) { \
  88 |  3.35M |     return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  89 |  3.35M |   }
  90 |        |
  91 |        | #define FSADS32_H(h)                                                          \
  92 |        |   unsigned int vpx_sad_skip_32x##h##_avx2(                                    \
  93 |        |       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
  94 |  3.23M |       int ref_stride) {                                                       \
  95 |  3.23M |     return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  96 |  3.23M |                             h / 2);                                           \
  97 |  3.23M |   }

  Instantiation vpx_sad_skip_32x64_avx2:
  94 |  80.7k |       int ref_stride) {                                                       \
  95 |  80.7k |     return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  96 |  80.7k |                             h / 2);                                           \
  97 |  80.7k |   }

  Instantiation vpx_sad_skip_32x32_avx2:
  94 |  2.24M |       int ref_stride) {                                                       \
  95 |  2.24M |     return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  96 |  2.24M |                             h / 2);                                           \
  97 |  2.24M |   }

  Instantiation vpx_sad_skip_32x16_avx2:
  94 |   911k |       int ref_stride) {                                                       \
  95 |   911k |     return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  96 |   911k |                             h / 2);                                           \
  97 |   911k |   }
  98 |        |
  99 |        | #define FSAD64  \
 100 |        |   FSAD64_H(64)  \
 101 |        |   FSAD64_H(32)  \
 102 |        |   FSADS64_H(64) \
 103 |        |   FSADS64_H(32)
 104 |        |
 105 |        | #define FSAD32  \
 106 |        |   FSAD32_H(64)  \
 107 |        |   FSAD32_H(32)  \
 108 |        |   FSAD32_H(16)  \
 109 |        |   FSADS32_H(64) \
 110 |        |   FSADS32_H(32) \
 111 |        |   FSADS32_H(16)
 112 |        |
 113 |        | FSAD64
 114 |        | FSAD32
 115 |        |
 116 |        | #undef FSAD64
 117 |        | #undef FSAD32
 118 |        | #undef FSAD64_H
 119 |        | #undef FSAD32_H
 120 |        | #undef FSADS64_H
 121 |        | #undef FSADS32_H
 122 |        |
 123 |        | #define FSADAVG64_H(h)                                                        \
 124 |        |   unsigned int vpx_sad64x##h##_avg_avx2(                                      \
 125 |        |       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
 126 |      0 |       int ref_stride, const uint8_t *second_pred) {                           \
 127 |      0 |     int i;                                                                    \
 128 |      0 |     __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
 129 |      0 |     __m256i sum_sad = _mm256_setzero_si256();                                 \
 130 |      0 |     __m256i sum_sad_h;                                                        \
 131 |      0 |     __m128i sum_sad128;                                                       \
 132 |      0 |     for (i = 0; i < h; i++) {                                                 \
 133 |      0 |       ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
 134 |      0 |       ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
 135 |      0 |       ref1_reg = _mm256_avg_epu8(                                             \
 136 |      0 |           ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
 137 |      0 |       ref2_reg = _mm256_avg_epu8(                                             \
 138 |      0 |           ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
 139 |      0 |       sad1_reg = _mm256_sad_epu8(                                             \
 140 |      0 |           ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
 141 |      0 |       sad2_reg = _mm256_sad_epu8(                                             \
 142 |      0 |           ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
 143 |      0 |       sum_sad =                                                               \
 144 |      0 |           _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
 145 |      0 |       ref_ptr += ref_stride;                                                  \
 146 |      0 |       src_ptr += src_stride;                                                  \
 147 |      0 |       second_pred += 64;                                                      \
 148 |      0 |     }                                                                         \
 149 |      0 |     sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
 150 |      0 |     sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
 151 |      0 |     sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
 152 |      0 |     sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
 153 |      0 |     return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                       \
 154 |      0 |   }
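The *_avg_avx2 paths in this file were never executed in this run (all counts are 0). For reference, a minimal scalar sketch of what they would compute is shown below, assuming the rounding-up average of _mm256_avg_epu8, i.e. (a + b + 1) >> 1; the helper name sad64xh_avg_scalar is hypothetical and not part of sad_avx2.c.

    /*
     * Scalar sketch (hypothetical helper) of the unexecuted avg path: ref is
     * first averaged with second_pred, then SAD'd against src.
     */
    #include <stdint.h>
    #include <stdlib.h>

    static unsigned int sad64xh_avg_scalar(const uint8_t *src_ptr, int src_stride,
                                           const uint8_t *ref_ptr, int ref_stride,
                                           const uint8_t *second_pred, int h) {
      unsigned int sad = 0;
      int i, j;
      for (i = 0; i < h; i++) {
        for (j = 0; j < 64; j++) {
          const int avg = (ref_ptr[j] + second_pred[j] + 1) >> 1;  /* (a + b + 1) >> 1 */
          sad += abs(src_ptr[j] - avg);
        }
        src_ptr += src_stride;
        ref_ptr += ref_stride;
        second_pred += 64;  /* second_pred is packed as contiguous 64-wide rows */
      }
      return sad;
    }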
 155 |        |
 156 |        | #define FSADAVG32_H(h)                                                        \
 157 |        |   unsigned int vpx_sad32x##h##_avg_avx2(                                      \
 158 |        |       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
 159 |      0 |       int ref_stride, const uint8_t *second_pred) {                           \
 160 |      0 |     int i;                                                                    \
 161 |      0 |     __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
 162 |      0 |     __m256i sum_sad = _mm256_setzero_si256();                                 \
 163 |      0 |     __m256i sum_sad_h;                                                        \
 164 |      0 |     __m128i sum_sad128;                                                       \
 165 |      0 |     int ref2_stride = ref_stride << 1;                                        \
 166 |      0 |     int src2_stride = src_stride << 1;                                        \
 167 |      0 |     int max = h >> 1;                                                         \
 168 |      0 |     for (i = 0; i < max; i++) {                                               \
 169 |      0 |       ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
 170 |      0 |       ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
 171 |      0 |       ref1_reg = _mm256_avg_epu8(                                             \
 172 |      0 |           ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
 173 |      0 |       ref2_reg = _mm256_avg_epu8(                                             \
 174 |      0 |           ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
 175 |      0 |       sad1_reg = _mm256_sad_epu8(                                             \
 176 |      0 |           ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
 177 |      0 |       sad2_reg = _mm256_sad_epu8(                                             \
 178 |      0 |           ref2_reg,                                                           \
 179 |      0 |           _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
 180 |      0 |       sum_sad =                                                               \
 181 |      0 |           _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
 182 |      0 |       ref_ptr += ref2_stride;                                                 \
 183 |      0 |       src_ptr += src2_stride;                                                 \
 184 |      0 |       second_pred += 64;                                                      \
 185 |      0 |     }                                                                         \
 186 |      0 |     sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
 187 |      0 |     sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
 188 |      0 |     sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
 189 |      0 |     sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
 190 |      0 |     return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                       \
 191 |      0 |   }
 192 |        |
 193 |        | #define FSADAVG64 \
 194 |        |   FSADAVG64_H(64) \
 195 |        |   FSADAVG64_H(32)
 196 |        |
 197 |        | #define FSADAVG32 \
 198 |        |   FSADAVG32_H(64) \
 199 |        |   FSADAVG32_H(32) \
 200 |        |   FSADAVG32_H(16)
 201 |        |
 202 |      0 | FSADAVG64
         Unexecuted instantiation: vpx_sad64x64_avg_avx2
         Unexecuted instantiation: vpx_sad64x32_avg_avx2
 203 |        | FSADAVG32
         Unexecuted instantiation: vpx_sad32x64_avg_avx2
         Unexecuted instantiation: vpx_sad32x32_avg_avx2
         Unexecuted instantiation: vpx_sad32x16_avg_avx2
 204 |        |
 205 |        | #undef FSADAVG64
 206 |        | #undef FSADAVG32
 207 |        | #undef FSADAVG64_H
 208 |        | #undef FSADAVG32_H