Coverage Report

Created: 2026-02-14 06:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libvpx/vpx_dsp/x86/sad4d_avx2.c
Line
Count
Source
1
/*
2
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
#include <immintrin.h>  // AVX2
11
#include "./vpx_dsp_rtcd.h"
12
#include "vpx/vpx_integer.h"
13
14
// Note with sums[4] some versions of Visual Studio may fail due to parameter
15
// alignment, though the functions should be equivalent:
16
// error C2719: 'sums': formal parameter with requested alignment of 32 won't be
17
// aligned
18
static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
19
8.89M
                                uint32_t sad_array[4]) {
20
8.89M
  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
21
8.89M
  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
22
8.89M
  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
23
8.89M
  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
24
8.89M
                                    _mm256_extractf128_si256(t2, 1));
25
8.89M
  _mm_storeu_si128((__m128i *)sad_array, sum);
26
8.89M
}
27
28
static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
29
                                   const uint8_t *const ref_array[4],
30
                                   int ref_stride, int h,
31
6.84M
                                   uint32_t sad_array[4]) {
32
6.84M
  int i;
33
6.84M
  const uint8_t *refs[4];
34
6.84M
  __m256i sums[4];
35
36
6.84M
  refs[0] = ref_array[0];
37
6.84M
  refs[1] = ref_array[1];
38
6.84M
  refs[2] = ref_array[2];
39
6.84M
  refs[3] = ref_array[3];
40
6.84M
  sums[0] = _mm256_setzero_si256();
41
6.84M
  sums[1] = _mm256_setzero_si256();
42
6.84M
  sums[2] = _mm256_setzero_si256();
43
6.84M
  sums[3] = _mm256_setzero_si256();
44
45
171M
  for (i = 0; i < h; i++) {
46
164M
    __m256i r[4];
47
48
    // load src and all ref[]
49
164M
    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
50
164M
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
51
164M
    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
52
164M
    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
53
164M
    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
54
55
    // sum of the absolute differences between every ref[] to src
56
164M
    r[0] = _mm256_sad_epu8(r[0], s);
57
164M
    r[1] = _mm256_sad_epu8(r[1], s);
58
164M
    r[2] = _mm256_sad_epu8(r[2], s);
59
164M
    r[3] = _mm256_sad_epu8(r[3], s);
60
61
    // sum every ref[]
62
164M
    sums[0] = _mm256_add_epi32(sums[0], r[0]);
63
164M
    sums[1] = _mm256_add_epi32(sums[1], r[1]);
64
164M
    sums[2] = _mm256_add_epi32(sums[2], r[2]);
65
164M
    sums[3] = _mm256_add_epi32(sums[3], r[3]);
66
67
164M
    src_ptr += src_stride;
68
164M
    refs[0] += ref_stride;
69
164M
    refs[1] += ref_stride;
70
164M
    refs[2] += ref_stride;
71
164M
    refs[3] += ref_stride;
72
164M
  }
73
74
6.84M
  calc_final_4(sums, sad_array);
75
6.84M
}
76
77
static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
78
                                   const uint8_t *const ref_array[4],
79
                                   int ref_stride, int h,
80
2.05M
                                   uint32_t sad_array[4]) {
81
2.05M
  __m256i sums[4];
82
2.05M
  int i;
83
2.05M
  const uint8_t *refs[4];
84
85
2.05M
  refs[0] = ref_array[0];
86
2.05M
  refs[1] = ref_array[1];
87
2.05M
  refs[2] = ref_array[2];
88
2.05M
  refs[3] = ref_array[3];
89
2.05M
  sums[0] = _mm256_setzero_si256();
90
2.05M
  sums[1] = _mm256_setzero_si256();
91
2.05M
  sums[2] = _mm256_setzero_si256();
92
2.05M
  sums[3] = _mm256_setzero_si256();
93
94
96.5M
  for (i = 0; i < h; i++) {
95
94.4M
    __m256i r_lo[4], r_hi[4];
96
    // load 64 bytes from src and all ref[]
97
94.4M
    const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
98
94.4M
    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
99
94.4M
    r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
100
94.4M
    r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
101
94.4M
    r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
102
94.4M
    r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32));
103
94.4M
    r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
104
94.4M
    r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32));
105
94.4M
    r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
106
94.4M
    r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));
107
108
    // sum of the absolute differences between every ref[] to src
109
94.4M
    r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
110
94.4M
    r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
111
94.4M
    r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
112
94.4M
    r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo);
113
94.4M
    r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi);
114
94.4M
    r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi);
115
94.4M
    r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
116
94.4M
    r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);
117
118
    // sum every ref[]
119
94.4M
    sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
120
94.4M
    sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
121
94.4M
    sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
122
94.4M
    sums[3] = _mm256_add_epi32(sums[3], r_lo[3]);
123
94.4M
    sums[0] = _mm256_add_epi32(sums[0], r_hi[0]);
124
94.4M
    sums[1] = _mm256_add_epi32(sums[1], r_hi[1]);
125
94.4M
    sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
126
94.4M
    sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);
127
128
94.4M
    src_ptr += src_stride;
129
94.4M
    refs[0] += ref_stride;
130
94.4M
    refs[1] += ref_stride;
131
94.4M
    refs[2] += ref_stride;
132
94.4M
    refs[3] += ref_stride;
133
94.4M
  }
134
135
2.05M
  calc_final_4(sums, sad_array);
136
2.05M
}
137
138
#define SAD64_H(h)                                                         \
139
  void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride,         \
140
                               const uint8_t *const ref_array[4],          \
141
1.05M
                               int ref_stride, uint32_t sad_array[4]) {    \
142
1.05M
    sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
143
1.05M
  }
144
145
#define SAD32_H(h)                                                         \
146
  void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride,         \
147
                               const uint8_t *const ref_array[4],          \
148
3.80M
                               int ref_stride, uint32_t sad_array[4]) {    \
149
3.80M
    sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
150
3.80M
  }
151
152
SAD64_H(64)
153
SAD32_H(32)
154
155
#define SADS64_H(h)                                                           \
156
  void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride,      \
157
                                     const uint8_t *const ref_array[4],       \
158
1.00M
                                     int ref_stride, uint32_t sad_array[4]) { \
159
1.00M
    sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,           \
160
1.00M
                    ((h) >> 1), sad_array);                                   \
161
1.00M
    sad_array[0] <<= 1;                                                       \
162
1.00M
    sad_array[1] <<= 1;                                                       \
163
1.00M
    sad_array[2] <<= 1;                                                       \
164
1.00M
    sad_array[3] <<= 1;                                                       \
165
1.00M
  }
vpx_sad_skip_64x64x4d_avx2
Line
Count
Source
158
694k
                                     int ref_stride, uint32_t sad_array[4]) { \
159
694k
    sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,           \
160
694k
                    ((h) >> 1), sad_array);                                   \
161
694k
    sad_array[0] <<= 1;                                                       \
162
694k
    sad_array[1] <<= 1;                                                       \
163
694k
    sad_array[2] <<= 1;                                                       \
164
694k
    sad_array[3] <<= 1;                                                       \
165
694k
  }
vpx_sad_skip_64x32x4d_avx2
Line
Count
Source
158
306k
                                     int ref_stride, uint32_t sad_array[4]) { \
159
306k
    sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,           \
160
306k
                    ((h) >> 1), sad_array);                                   \
161
306k
    sad_array[0] <<= 1;                                                       \
162
306k
    sad_array[1] <<= 1;                                                       \
163
306k
    sad_array[2] <<= 1;                                                       \
164
306k
    sad_array[3] <<= 1;                                                       \
165
306k
  }
166
167
#define SADS32_H(h)                                                           \
168
  void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride,      \
169
                                     const uint8_t *const ref_array[4],       \
170
3.03M
                                     int ref_stride, uint32_t sad_array[4]) { \
171
3.03M
    sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,           \
172
3.03M
                    ((h) >> 1), sad_array);                                   \
173
3.03M
    sad_array[0] <<= 1;                                                       \
174
3.03M
    sad_array[1] <<= 1;                                                       \
175
3.03M
    sad_array[2] <<= 1;                                                       \
176
3.03M
    sad_array[3] <<= 1;                                                       \
177
3.03M
  }
vpx_sad_skip_32x64x4d_avx2
Line
Count
Source
170
26.2k
                                     int ref_stride, uint32_t sad_array[4]) { \
171
26.2k
    sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,           \
172
26.2k
                    ((h) >> 1), sad_array);                                   \
173
26.2k
    sad_array[0] <<= 1;                                                       \
174
26.2k
    sad_array[1] <<= 1;                                                       \
175
26.2k
    sad_array[2] <<= 1;                                                       \
176
26.2k
    sad_array[3] <<= 1;                                                       \
177
26.2k
  }
vpx_sad_skip_32x32x4d_avx2
Line
Count
Source
170
2.21M
                                     int ref_stride, uint32_t sad_array[4]) { \
171
2.21M
    sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,           \
172
2.21M
                    ((h) >> 1), sad_array);                                   \
173
2.21M
    sad_array[0] <<= 1;                                                       \
174
2.21M
    sad_array[1] <<= 1;                                                       \
175
2.21M
    sad_array[2] <<= 1;                                                       \
176
2.21M
    sad_array[3] <<= 1;                                                       \
177
2.21M
  }
vpx_sad_skip_32x16x4d_avx2
Line
Count
Source
170
796k
                                     int ref_stride, uint32_t sad_array[4]) { \
171
796k
    sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,           \
172
796k
                    ((h) >> 1), sad_array);                                   \
173
796k
    sad_array[0] <<= 1;                                                       \
174
796k
    sad_array[1] <<= 1;                                                       \
175
796k
    sad_array[2] <<= 1;                                                       \
176
796k
    sad_array[3] <<= 1;                                                       \
177
796k
  }
178
179
SADS64_H(64)
180
SADS64_H(32)
181
182
SADS32_H(64)
183
SADS32_H(32)
184
SADS32_H(16)