Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/x86/sad_avx2.c
Line   Count  Source
   1          /*
   2           *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
   3           *
   4           *  Use of this source code is governed by a BSD-style license
   5           *  that can be found in the LICENSE file in the root of the source
   6           *  tree. An additional intellectual property rights grant can be found
   7           *  in the file PATENTS.  All contributing project authors may
   8           *  be found in the AUTHORS file in the root of the source tree.
   9           */
  10          #include <immintrin.h>
  11          #include "./vpx_dsp_rtcd.h"
  12          #include "vpx_ports/mem.h"
  13
  14          static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
  15                                                  const uint8_t *ref_ptr, int ref_stride,
  16   2.78M                                          int h) {
  17   2.78M    int i, res;
  18   2.78M    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
  19   2.78M    __m256i sum_sad = _mm256_setzero_si256();
  20   2.78M    __m256i sum_sad_h;
  21   2.78M    __m128i sum_sad128;
  22    108M    for (i = 0; i < h; i++) {
  23    106M      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
  24    106M      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
  25    106M      sad1_reg =
  26    106M          _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
  27    106M      sad2_reg = _mm256_sad_epu8(
  28    106M          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
  29    106M      sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
  30    106M      ref_ptr += ref_stride;
  31    106M      src_ptr += src_stride;
  32    106M    }
  33   2.78M    sum_sad_h = _mm256_srli_si256(sum_sad, 8);
  34   2.78M    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  35   2.78M    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  36   2.78M    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
  37   2.78M    res = _mm_cvtsi128_si32(sum_sad128);
  38   2.78M    return res;
  39   2.78M  }
  40
  41          static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
  42                                                  const uint8_t *ref_ptr, int ref_stride,
  43   14.7M                                          int h) {
  44   14.7M    int i, res;
  45   14.7M    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
  46   14.7M    __m256i sum_sad = _mm256_setzero_si256();
  47   14.7M    __m256i sum_sad_h;
  48   14.7M    __m128i sum_sad128;
  49   14.7M    const int ref2_stride = ref_stride << 1;
  50   14.7M    const int src2_stride = src_stride << 1;
  51   14.7M    const int max = h >> 1;
  52    197M    for (i = 0; i < max; i++) {
  53    183M      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
  54    183M      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
  55    183M      sad1_reg =
  56    183M          _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
  57    183M      sad2_reg = _mm256_sad_epu8(
  58    183M          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
  59    183M      sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
  60    183M      ref_ptr += ref2_stride;
  61    183M      src_ptr += src2_stride;
  62    183M    }
  63   14.7M    sum_sad_h = _mm256_srli_si256(sum_sad, 8);
  64   14.7M    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  65   14.7M    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  66   14.7M    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
  67   14.7M    res = _mm_cvtsi128_si32(sum_sad128);
  68   14.7M    return res;
  69   14.7M  }
  70
  71          #define FSAD64_H(h)                                                           \
  72            unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
  73   2.05M                                      const uint8_t *ref_ptr, int ref_stride) { \
  74   2.05M      return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  75   2.05M    }
Instantiation: vpx_sad64x64_avx2
  73    644k                                      const uint8_t *ref_ptr, int ref_stride) { \
  74    644k      return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  75    644k    }
Instantiation: vpx_sad64x32_avx2
  73   1.40M                                      const uint8_t *ref_ptr, int ref_stride) { \
  74   1.40M      return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  75   1.40M    }
  76
  77          #define FSADS64_H(h)                                                          \
  78            unsigned int vpx_sad_skip_64x##h##_avx2(                                    \
  79                const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
  80    729k        int ref_stride) {                                                       \
  81    729k      return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  82    729k                              h / 2);                                           \
  83    729k    }
Instantiation: vpx_sad_skip_64x64_avx2
  80    515k        int ref_stride) {                                                       \
  81    515k      return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  82    515k                              h / 2);                                           \
  83    515k    }
Instantiation: vpx_sad_skip_64x32_avx2
  80    214k        int ref_stride) {                                                       \
  81    214k      return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  82    214k                              h / 2);                                           \
  83    214k    }
  84
  85          #define FSAD32_H(h)                                                           \
  86            unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
  87   10.4M                                      const uint8_t *ref_ptr, int ref_stride) { \
  88   10.4M      return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  89   10.4M    }
Instantiation: vpx_sad32x64_avx2
  87   1.21M                                      const uint8_t *ref_ptr, int ref_stride) { \
  88   1.21M      return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  89   1.21M    }
Instantiation: vpx_sad32x32_avx2
  87   4.86M                                      const uint8_t *ref_ptr, int ref_stride) { \
  88   4.86M      return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  89   4.86M    }
Instantiation: vpx_sad32x16_avx2
  87   4.37M                                      const uint8_t *ref_ptr, int ref_stride) { \
  88   4.37M      return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
  89   4.37M    }
  90
  91          #define FSADS32_H(h)                                                          \
  92            unsigned int vpx_sad_skip_32x##h##_avx2(                                    \
  93                const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
  94   4.26M        int ref_stride) {                                                       \
  95   4.26M      return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  96   4.26M                              h / 2);                                           \
  97   4.26M    }
Instantiation: vpx_sad_skip_32x64_avx2
  94    110k        int ref_stride) {                                                       \
  95    110k      return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  96    110k                              h / 2);                                           \
  97    110k    }
Instantiation: vpx_sad_skip_32x32_avx2
  94   3.22M        int ref_stride) {                                                       \
  95   3.22M      return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  96   3.22M                              h / 2);                                           \
  97   3.22M    }
Instantiation: vpx_sad_skip_32x16_avx2
  94    930k        int ref_stride) {                                                       \
  95    930k      return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
  96    930k                              h / 2);                                           \
  97    930k    }
  98
  99          #define FSAD64  \
 100            FSAD64_H(64)  \
 101            FSAD64_H(32)  \
 102            FSADS64_H(64) \
 103            FSADS64_H(32)
 104
 105          #define FSAD32  \
 106            FSAD32_H(64)  \
 107            FSAD32_H(32)  \
 108            FSAD32_H(16)  \
 109            FSADS32_H(64) \
 110            FSADS32_H(32) \
 111            FSADS32_H(16)
 112
 113          FSAD64
 114          FSAD32
 115
 116          #undef FSAD64
 117          #undef FSAD32
 118          #undef FSAD64_H
 119          #undef FSAD32_H
 120          #undef FSADS64_H
 121          #undef FSADS32_H
 122
 123          #define FSADAVG64_H(h)                                                        \
 124            unsigned int vpx_sad64x##h##_avg_avx2(                                      \
 125                const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
 126       0        int ref_stride, const uint8_t *second_pred) {                           \
 127       0      int i;                                                                    \
 128       0      __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
 129       0      __m256i sum_sad = _mm256_setzero_si256();                                 \
 130       0      __m256i sum_sad_h;                                                        \
 131       0      __m128i sum_sad128;                                                       \
 132       0      for (i = 0; i < h; i++) {                                                 \
 133       0        ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
 134       0        ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
 135       0        ref1_reg = _mm256_avg_epu8(                                             \
 136       0            ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
 137       0        ref2_reg = _mm256_avg_epu8(                                             \
 138       0            ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
 139       0        sad1_reg = _mm256_sad_epu8(                                             \
 140       0            ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
 141       0        sad2_reg = _mm256_sad_epu8(                                             \
 142       0            ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
 143       0        sum_sad =                                                               \
 144       0            _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
 145       0        ref_ptr += ref_stride;                                                  \
 146       0        src_ptr += src_stride;                                                  \
 147       0        second_pred += 64;                                                      \
 148       0      }                                                                         \
 149       0      sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
 150       0      sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
 151       0      sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
 152       0      sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
 153       0      return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                       \
 154       0    }
Unexecuted instantiation: vpx_sad64x64_avg_avx2
Unexecuted instantiation: vpx_sad64x32_avg_avx2
 155
 156          #define FSADAVG32_H(h)                                                        \
 157            unsigned int vpx_sad32x##h##_avg_avx2(                                      \
 158                const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
 159       0        int ref_stride, const uint8_t *second_pred) {                           \
 160       0      int i;                                                                    \
 161       0      __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
 162       0      __m256i sum_sad = _mm256_setzero_si256();                                 \
 163       0      __m256i sum_sad_h;                                                        \
 164       0      __m128i sum_sad128;                                                       \
 165       0      int ref2_stride = ref_stride << 1;                                        \
 166       0      int src2_stride = src_stride << 1;                                        \
 167       0      int max = h >> 1;                                                         \
 168       0      for (i = 0; i < max; i++) {                                               \
 169       0        ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
 170       0        ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
 171       0        ref1_reg = _mm256_avg_epu8(                                             \
 172       0            ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
 173       0        ref2_reg = _mm256_avg_epu8(                                             \
 174       0            ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
 175       0        sad1_reg = _mm256_sad_epu8(                                             \
 176       0            ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
 177       0        sad2_reg = _mm256_sad_epu8(                                             \
 178       0            ref2_reg,                                                           \
 179       0            _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
 180       0        sum_sad =                                                               \
 181       0            _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
 182       0        ref_ptr += ref2_stride;                                                 \
 183       0        src_ptr += src2_stride;                                                 \
 184       0        second_pred += 64;                                                      \
 185       0      }                                                                         \
 186       0      sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
 187       0      sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
 188       0      sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
 189       0      sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
 190       0      return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                       \
 191       0    }
Unexecuted instantiation: vpx_sad32x64_avg_avx2
Unexecuted instantiation: vpx_sad32x32_avg_avx2
Unexecuted instantiation: vpx_sad32x16_avg_avx2
 192
 193          #define FSADAVG64 \
 194            FSADAVG64_H(64) \
 195            FSADAVG64_H(32)
 196
 197          #define FSADAVG32 \
 198            FSADAVG32_H(64) \
 199            FSADAVG32_H(32) \
 200            FSADAVG32_H(16)
 201
 202          FSADAVG64
 203          FSADAVG32
 204
 205          #undef FSADAVG64
 206          #undef FSADAVG32
 207          #undef FSADAVG64_H
 208          #undef FSADAVG32_H
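
Semantics of the covered code: the kernels above are AVX2 sum-of-absolute-differences (SAD) routines for 64- and 32-pixel-wide blocks. As a point of reference, a minimal scalar sketch of the same computation follows (illustrative only, assuming 8-bit samples; sad_wxh_c_ref and sad_skip_64x64_c_ref are hypothetical names, not libvpx symbols):

  #include <stdint.h>

  /* Sum of absolute differences over a width x h block of 8-bit samples,
   * the quantity the AVX2 kernels above accumulate with _mm256_sad_epu8. */
  static unsigned int sad_wxh_c_ref(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    int width, int h) {
    unsigned int sad = 0;
    int i, j;
    for (i = 0; i < h; i++) {
      for (j = 0; j < width; j++) {
        const int diff = src_ptr[j] - ref_ptr[j];
        sad += (unsigned int)(diff < 0 ? -diff : diff);
      }
      src_ptr += src_stride;
      ref_ptr += ref_stride;
    }
    return sad;
  }

  /* The vpx_sad_skip_* wrappers sample every other row and double the result;
   * for example, vpx_sad_skip_64x64_avx2 corresponds to: */
  static unsigned int sad_skip_64x64_c_ref(const uint8_t *src_ptr,
                                           int src_stride,
                                           const uint8_t *ref_ptr,
                                           int ref_stride) {
    return 2 * sad_wxh_c_ref(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2,
                             64, 64 / 2);
  }

The unexecuted vpx_sad64x*_avg_avx2 and vpx_sad32x*_avg_avx2 variants follow the same pattern but first average the reference rows with second_pred (via _mm256_avg_epu8) before the SAD accumulation.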