Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/x86/sse_avx2.c
Line |  Count | Source
   1 |        | /*
   2 |        |  *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
   3 |        |  *
   4 |        |  *  Use of this source code is governed by a BSD-style license
   5 |        |  *  that can be found in the LICENSE file in the root of the source
   6 |        |  *  tree. An additional intellectual property rights grant can be found
   7 |        |  *  in the file PATENTS.  All contributing project authors may
   8 |        |  *  be found in the AUTHORS file in the root of the source tree.
   9 |        |  */
  10 |        |
  11 |        | #include <immintrin.h>
  12 |        | #include <smmintrin.h>
  13 |        | #include <stdint.h>
  14 |        |
  15 |        | #include "./vpx_config.h"
  16 |        | #include "./vpx_dsp_rtcd.h"
  17 |        |
  18 |        | #include "vpx_ports/mem.h"
  19 |        | #include "vpx_dsp/x86/mem_sse2.h"
  20 |        |
  21 |        | static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
  22 |      0 |                                 const uint8_t *b) {
  23 |      0 |   const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a);
  24 |      0 |   const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b);
  25 |      0 |   const __m256i zero = _mm256_setzero_si256();
  26 |      0 |   const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
  27 |      0 |   const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
  28 |      0 |   const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
  29 |      0 |   const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
  30 |      0 |   const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
  31 |      0 |   const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
  32 |      0 |   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
  33 |      0 |   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
  34 |      0 | }
  35 |        |
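Note: sse_w32_avx2 handles one 32-pixel row. The bytes are widened to
16 bits by interleaving with zero, differenced, and _mm256_madd_epi16
squares and pairwise-adds the differences into eight 32-bit partial
sums. With |a[x] - b[x]| <= 255, each madd lane gains at most
2 * 255 * 255 = 130050 per call, so the 32-bit accumulation is exact.
A scalar sketch of the quantity one call folds in (a hypothetical
reference helper, not part of this file):

    static uint32_t sse_row_w32_ref(const uint8_t *a, const uint8_t *b) {
      uint32_t sse = 0;
      for (int x = 0; x < 32; ++x) {
        const int d = a[x] - b[x]; /* range [-255, 255] */
        sse += (uint32_t)(d * d);
      }
      return sse;
    }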
  36 |  4.77M | static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
  37 |  4.77M |   int64_t sum;
  38 |  4.77M |   __m256i zero = _mm256_setzero_si256();
  39 |  4.77M |   const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
  40 |  4.77M |   const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
  41 |  4.77M |   const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
  42 |  4.77M |   const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
  43 |  4.77M |                                          _mm256_extracti128_si256(sum_4x64, 1));
  44 |  4.77M |   const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
  45 |  4.77M |   _mm_storel_epi64((__m128i *)&sum, sum_1x64);
  46 |  4.77M |   return sum;
  47 |  4.77M | }
  48 |        |
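Note: summary_all_avx2 is the horizontal reduction. Interleaving the
eight 32-bit lanes with zero zero-extends them to 64 bits (the partial
sums are non-negative, so unsigned extension is safe), then the vector
is folded 256 -> 128 -> 64 bits and the low quadword is stored. A
hypothetical scalar equivalent:

    static int64_t summary_all_ref(const uint32_t lanes[8]) {
      int64_t sum = 0;
      for (int i = 0; i < 8; ++i) sum += (int64_t)lanes[i];
      return sum;
    }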
  49 |        | #if CONFIG_VP9_HIGHBITDEPTH
  50 |      0 | static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
  51 |      0 |   const __m256i sum0_4x64 =
  52 |      0 |       _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32));
  53 |      0 |   const __m256i sum1_4x64 =
  54 |      0 |       _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1));
  55 |      0 |   const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
  56 |      0 |   *sum = _mm256_add_epi64(*sum, sum_4x64);
  57 |      0 | }
  58 |        |
  59 |      0 | static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) {
  60 |      0 |   int64_t sum;
  61 |      0 |   const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
  62 |      0 |                                          _mm256_extracti128_si256(sum_4x64, 1));
  63 |      0 |   const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
  64 |        |
  65 |      0 |   _mm_storel_epi64((__m128i *)&sum, sum_1x64);
  66 |      0 |   return sum;
  67 |      0 | }
  68 |        | #endif
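Note: these two helpers are only needed by the high-bit-depth code
further down. summary_32_avx2 flushes a 32-bit accumulator into a
64-bit one; _mm256_cvtepu32_epi64 zero-extends, so the 32-bit lanes may
legally use their full unsigned range before a flush. summary_4x64_avx2
then folds the four 64-bit lanes into the final scalar. The lane
mapping of one flush, written out as a hypothetical scalar sketch:

    static void summary_32_ref(const uint32_t sum32[8], uint64_t sum64[4]) {
      for (int i = 0; i < 4; ++i) {
        sum64[i] += (uint64_t)sum32[i] + (uint64_t)sum32[i + 4];
      }
    }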
  69 |        |
  70 |        | static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
  71 |      0 |                                  const uint8_t *b, int b_stride, __m256i *sum) {
  72 |      0 |   const __m128i v_a0 = load_unaligned_u32(a);
  73 |      0 |   const __m128i v_a1 = load_unaligned_u32(a + a_stride);
  74 |      0 |   const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2);
  75 |      0 |   const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3);
  76 |      0 |   const __m128i v_b0 = load_unaligned_u32(b);
  77 |      0 |   const __m128i v_b1 = load_unaligned_u32(b + b_stride);
  78 |      0 |   const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2);
  79 |      0 |   const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3);
  80 |      0 |   const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
  81 |      0 |                                              _mm_unpacklo_epi32(v_a2, v_a3));
  82 |      0 |   const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
  83 |      0 |                                              _mm_unpacklo_epi32(v_b2, v_b3));
  84 |      0 |   const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
  85 |      0 |   const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
  86 |      0 |   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
  87 |      0 |   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
  88 |      0 | }
  89 |        |
  90 |        | static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
  91 |      0 |                                  const uint8_t *b, int b_stride, __m256i *sum) {
  92 |      0 |   const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
  93 |      0 |   const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
  94 |      0 |   const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
  95 |      0 |   const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
  96 |      0 |   const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
  97 |      0 |   const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
  98 |      0 |   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
  99 |      0 |   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
 100 |      0 | }
 101 |        |
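Note: the two narrow-block helpers gather several short rows into a
single vector so that one widen/subtract/madd sequence covers a whole
4x4 or 8x2 tile: load_unaligned_u32 (from mem_sse2.h) fetches 4 bytes
per row, _mm_loadl_epi64 fetches 8. A hypothetical scalar reference for
the 4x4 tile:

    static uint32_t sse_4x4_ref(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride) {
      uint32_t sse = 0;
      for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
          const int d = a[r * a_stride + c] - b[r * b_stride + c];
          sse += (uint32_t)(d * d);
        }
      }
      return sse;
    }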
 102 |        | int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
 103 |  4.77M |                      int b_stride, int width, int height) {
 104 |  4.77M |   int32_t y = 0;
 105 |  4.77M |   int64_t sse = 0;
 106 |  4.77M |   __m256i sum = _mm256_setzero_si256();
 107 |  4.77M |   __m256i zero = _mm256_setzero_si256();
 108 |  4.77M |   switch (width) {
 109 |      0 |     case 4:
 110 |      0 |       do {
 111 |      0 |         sse_w4x4_avx2(a, a_stride, b, b_stride, &sum);
 112 |      0 |         a += a_stride << 2;
 113 |      0 |         b += b_stride << 2;
 114 |      0 |         y += 4;
 115 |      0 |       } while (y < height);
 116 |      0 |       sse = summary_all_avx2(&sum);
 117 |      0 |       break;
 118 |      0 |     case 8:
 119 |      0 |       do {
 120 |      0 |         sse_w8x2_avx2(a, a_stride, b, b_stride, &sum);
 121 |      0 |         a += a_stride << 1;
 122 |      0 |         b += b_stride << 1;
 123 |      0 |         y += 2;
 124 |      0 |       } while (y < height);
 125 |      0 |       sse = summary_all_avx2(&sum);
 126 |      0 |       break;
 127 |  4.77M |     case 16:
 128 |  38.1M |       do {
 129 |  38.1M |         const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a);
 130 |  38.1M |         const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride));
 131 |  38.1M |         const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b);
 132 |  38.1M |         const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride));
 133 |  38.1M |         const __m256i v_a =
 134 |  38.1M |             _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
 135 |  38.1M |         const __m256i v_b =
 136 |  38.1M |             _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
 137 |  38.1M |         const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
 138 |  38.1M |         const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
 139 |  38.1M |         const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
 140 |  38.1M |         const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
 141 |  38.1M |         const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
 142 |  38.1M |         const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
 143 |  38.1M |         const __m256i temp =
 144 |  38.1M |             _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
 145 |  38.1M |                              _mm256_madd_epi16(v_bsub, v_bsub));
 146 |  38.1M |         sum = _mm256_add_epi32(sum, temp);
 147 |  38.1M |         a += a_stride << 1;
 148 |  38.1M |         b += b_stride << 1;
 149 |  38.1M |         y += 2;
 150 |  38.1M |       } while (y < height);
 151 |  4.77M |       sse = summary_all_avx2(&sum);
 152 |  4.77M |       break;
 153 |      0 |     case 32:
 154 |      0 |       do {
 155 |      0 |         sse_w32_avx2(&sum, a, b);
 156 |      0 |         a += a_stride;
 157 |      0 |         b += b_stride;
 158 |      0 |         y += 1;
 159 |      0 |       } while (y < height);
 160 |      0 |       sse = summary_all_avx2(&sum);
 161 |      0 |       break;
 162 |      0 |     case 64:
 163 |      0 |       do {
 164 |      0 |         sse_w32_avx2(&sum, a, b);
 165 |      0 |         sse_w32_avx2(&sum, a + 32, b + 32);
 166 |      0 |         a += a_stride;
 167 |      0 |         b += b_stride;
 168 |      0 |         y += 1;
 169 |      0 |       } while (y < height);
 170 |      0 |       sse = summary_all_avx2(&sum);
 171 |      0 |       break;
 172 |      0 |     default:
 173 |      0 |       if ((width & 0x07) == 0) {
 174 |      0 |         do {
 175 |      0 |           int i = 0;
 176 |      0 |           do {
 177 |      0 |             sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
 178 |      0 |             i += 8;
 179 |      0 |           } while (i < width);
 180 |      0 |           a += a_stride << 1;
 181 |      0 |           b += b_stride << 1;
 182 |      0 |           y += 2;
 183 |      0 |         } while (y < height);
 184 |      0 |       } else {
 185 |      0 |         do {
 186 |      0 |           int i = 0;
 187 |      0 |           do {
 188 |      0 |             const uint8_t *a2;
 189 |      0 |             const uint8_t *b2;
 190 |      0 |             sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
 191 |      0 |             a2 = a + i + (a_stride << 1);
 192 |      0 |             b2 = b + i + (b_stride << 1);
 193 |      0 |             sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum);
 194 |      0 |             i += 8;
 195 |      0 |           } while (i + 4 < width);
 196 |      0 |           sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum);
 197 |      0 |           a += a_stride << 2;
 198 |      0 |           b += b_stride << 2;
 199 |      0 |           y += 4;
 200 |      0 |         } while (y < height);
 201 |      0 |       }
 202 |      0 |       sse = summary_all_avx2(&sum);
 203 |      0 |       break;
 204 |  4.77M |   }
 205 |        |
 206 |  4.77M |   return sse;
 207 |  4.77M | }
 208 |        |
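Note: in this run only the width-16 path was exercised: 4.77M calls and
38.1M trips through the two-row inner loop, i.e. roughly 8 iterations
(16 rows) per call, consistent with 16x16 blocks. Callers normally
reach this function through the vpx_sse pointer dispatched via
./vpx_dsp_rtcd.h rather than by name; calling the AVX2 variant directly
is only valid on AVX2-capable CPUs. A minimal usage sketch (the 16x16
block size is an assumption for illustration):

    int64_t block_sse_16x16(const uint8_t *src, int src_stride,
                            const uint8_t *rec, int rec_stride) {
      /* sum of squared differences over a 16x16 block */
      return vpx_sse_avx2(src, src_stride, rec, rec_stride, 16, 16);
    }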
 209 |        | #if CONFIG_VP9_HIGHBITDEPTH
 210 |        | static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
 211 |      0 |                                        const uint16_t *b) {
 212 |      0 |   const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a);
 213 |      0 |   const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b);
 214 |      0 |   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
 215 |      0 |   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
 216 |      0 | }
 217 |        |
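Note: with 16-bit samples the differences are already in the width that
_mm256_madd_epi16 consumes, so no unpacking is needed. Assuming 12-bit
sources (an assumption; VP9 high bit depth is 10 or 12 bits), the
worst-case growth per call is bounded like this:

    /* |d| <= 4095            -> d * d         <= 16769025 (~2^24) */
    /* madd adds two products -> per-lane gain  <= 33538050 (~2^25) */
    /* so a 32-bit lane tolerates only a limited number of rows     */
    /* before it must be flushed; the callers below enforce that.   */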
 218 |        | static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a,
 219 |        |                                         int a_stride, const uint16_t *b,
 220 |      0 |                                         int b_stride) {
 221 |      0 |   const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
 222 |      0 |   const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
 223 |      0 |   const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2));
 224 |      0 |   const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3));
 225 |      0 |   const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
 226 |      0 |   const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
 227 |      0 |   const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2));
 228 |      0 |   const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3));
 229 |      0 |   const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1);
 230 |      0 |   const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3);
 231 |      0 |   const __m256i v_a_w =
 232 |      0 |       _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1);
 233 |      0 |   const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1);
 234 |      0 |   const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3);
 235 |      0 |   const __m256i v_b_w =
 236 |      0 |       _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1);
 237 |      0 |   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
 238 |      0 |   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
 239 |      0 | }
 240 |        |
 241 |        | static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a,
 242 |        |                                         int a_stride, const uint16_t *b,
 243 |      0 |                                         int b_stride) {
 244 |      0 |   const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride));
 245 |      0 |   const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a);
 246 |      0 |   const __m256i v_a_w =
 247 |      0 |       _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1);
 248 |      0 |   const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride));
 249 |      0 |   const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b);
 250 |      0 |   const __m256i v_b_w =
 251 |      0 |       _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1);
 252 |      0 |   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
 253 |      0 |   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
 254 |      0 | }
 255 |        |
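Note: in highbd_sse_w4x4_avx2 the _hi/_lo suffixes name vector halves,
not row order: rows 0-1 land in the upper 128 bits and rows 2-3 in the
lower. The order is irrelevant because only the sum over all lanes is
consumed. A hypothetical scalar reference for the 8x2 variant:

    static uint32_t highbd_sse_8x2_ref(const uint16_t *a, int a_stride,
                                       const uint16_t *b, int b_stride) {
      uint32_t sse = 0;
      for (int r = 0; r < 2; ++r) {
        for (int c = 0; c < 8; ++c) {
          const int d = a[r * a_stride + c] - b[r * b_stride + c];
          sse += (uint32_t)(d * d);
        }
      }
      return sse;
    }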
 256 |        | int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
 257 |      0 |                             int b_stride, int width, int height) {
 258 |      0 |   int32_t y = 0;
 259 |      0 |   int64_t sse = 0;
 260 |      0 |   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
 261 |      0 |   uint16_t *b = CONVERT_TO_SHORTPTR(b8);
 262 |      0 |   __m256i sum = _mm256_setzero_si256();
 263 |      0 |   switch (width) {
 264 |      0 |     case 4:
 265 |      0 |       do {
 266 |      0 |         highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride);
 267 |      0 |         a += a_stride << 2;
 268 |      0 |         b += b_stride << 2;
 269 |      0 |         y += 4;
 270 |      0 |       } while (y < height);
 271 |      0 |       sse = summary_all_avx2(&sum);
 272 |      0 |       break;
 273 |      0 |     case 8:
 274 |      0 |       do {
 275 |      0 |         highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride);
 276 |      0 |         a += a_stride << 1;
 277 |      0 |         b += b_stride << 1;
 278 |      0 |         y += 2;
 279 |      0 |       } while (y < height);
 280 |      0 |       sse = summary_all_avx2(&sum);
 281 |      0 |       break;
 282 |      0 |     case 16:
 283 |      0 |       do {
 284 |      0 |         highbd_sse_w16_avx2(&sum, a, b);
 285 |      0 |         a += a_stride;
 286 |      0 |         b += b_stride;
 287 |      0 |         y += 1;
 288 |      0 |       } while (y < height);
 289 |      0 |       sse = summary_all_avx2(&sum);
 290 |      0 |       break;
 291 |      0 |     case 32:
 292 |      0 |       do {
 293 |      0 |         int l = 0;
 294 |      0 |         __m256i sum32 = _mm256_setzero_si256();
 295 |      0 |         do {
 296 |      0 |           highbd_sse_w16_avx2(&sum32, a, b);
 297 |      0 |           highbd_sse_w16_avx2(&sum32, a + 16, b + 16);
 298 |      0 |           a += a_stride;
 299 |      0 |           b += b_stride;
 300 |      0 |           l += 1;
 301 |      0 |         } while (l < 64 && l < (height - y));
 302 |      0 |         summary_32_avx2(&sum32, &sum);
 303 |      0 |         y += 64;
 304 |      0 |       } while (y < height);
 305 |      0 |       sse = summary_4x64_avx2(sum);
 306 |      0 |       break;
 307 |      0 |     case 64:
 308 |      0 |       do {
 309 |      0 |         int l = 0;
 310 |      0 |         __m256i sum32 = _mm256_setzero_si256();
 311 |      0 |         do {
 312 |      0 |           highbd_sse_w16_avx2(&sum32, a, b);
 313 |      0 |           highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
 314 |      0 |           highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
 315 |      0 |           highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
 316 |      0 |           a += a_stride;
 317 |      0 |           b += b_stride;
 318 |      0 |           l += 1;
 319 |      0 |         } while (l < 32 && l < (height - y));
 320 |      0 |         summary_32_avx2(&sum32, &sum);
 321 |      0 |         y += 32;
 322 |      0 |       } while (y < height);
 323 |      0 |       sse = summary_4x64_avx2(sum);
 324 |      0 |       break;
 325 |      0 |     default:
 326 |      0 |       if (width & 0x7) {
 327 |      0 |         do {
 328 |      0 |           int i = 0;
 329 |      0 |           __m256i sum32 = _mm256_setzero_si256();
 330 |      0 |           do {
 331 |      0 |             const uint16_t *a2;
 332 |      0 |             const uint16_t *b2;
 333 |      0 |             highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
 334 |      0 |             a2 = a + i + (a_stride << 1);
 335 |      0 |             b2 = b + i + (b_stride << 1);
 336 |      0 |             highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride);
 337 |      0 |             i += 8;
 338 |      0 |           } while (i + 4 < width);
 339 |      0 |           highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride);
 340 |      0 |           summary_32_avx2(&sum32, &sum);
 341 |      0 |           a += a_stride << 2;
 342 |      0 |           b += b_stride << 2;
 343 |      0 |           y += 4;
 344 |      0 |         } while (y < height);
 345 |      0 |       } else {
 346 |      0 |         do {
 347 |      0 |           int l = 0;
 348 |      0 |           __m256i sum32 = _mm256_setzero_si256();
 349 |      0 |           do {
 350 |      0 |             int i = 0;
 351 |      0 |             do {
 352 |      0 |               highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
 353 |      0 |               i += 8;
 354 |      0 |             } while (i < width);
 355 |      0 |             a += a_stride << 1;
 356 |      0 |             b += b_stride << 1;
 357 |      0 |             l += 2;
 358 |      0 |           } while (l < 8 && l < (height - y));
 359 |      0 |           summary_32_avx2(&sum32, &sum);
 360 |      0 |           y += 8;
 361 |      0 |         } while (y < height);
 362 |      0 |       }
 363 |      0 |       sse = summary_4x64_avx2(sum);
 364 |      0 |       break;
 365 |      0 |   }
 366 |      0 |   return sse;
 367 |      0 | }
 368 |        | #endif  // CONFIG_VP9_HIGHBITDEPTH
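Note: the inner-loop row caps in vpx_highbd_sse_avx2 act as an overflow
budget for the 32-bit accumulator, again assuming 12-bit input:

    /* width 32: 2 w16 calls/row -> per-lane gain/row <= 4 * 4095^2  */
    /*           = 67076100; 64 rows -> 4292870400 < 2^32            */
    /* width 64: 4 w16 calls/row -> 8 * 4095^2 = 134152200 per row;  */
    /*           32 rows -> 4292870400 < 2^32                        */

Because summary_32_avx2 zero-extends the lanes, treating them as
unsigned keeps the accumulation exact right up to that bound, after
which the running total moves into 64-bit lanes.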