Coverage Report

Created: 2025-06-13 07:07

/src/aom/av1/common/x86/cfl_sse2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <emmintrin.h>
13
14
#include "av1/common/cfl.h"
15
#include "config/av1_rtcd.h"
16
17
1.00M
// Horizontal add of the four 32-bit lanes of l0.
// Returns a vector in which every 32-bit lane holds lane0 + lane1 + lane2 +
// lane3 of the input (32-bit wrapping addition).
static inline __m128i fill_sum_epi32(__m128i l0) {
18
1.00M
  // Add the vector to itself with its 64-bit halves swapped: lanes become
  // {a+c, b+d, a+c, b+d}.
  l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
19
1.00M
  // Add again with adjacent 32-bit lanes swapped so that every lane ends up
  // with the full sum a+b+c+d.
  return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
20
1.00M
}
21
22
// Computes the rounded average of a width x height block of 16-bit samples
// and writes (sample - average) for every sample to dst_ptr.
//
//   src_ptr       input samples; consecutive rows are CFL_BUF_LINE_I128
//                 128-bit vectors apart.
//   dst_ptr       output; rows use the same CFL_BUF_LINE_I128 stride.
//   width         samples per row. The code has dedicated paths for 4, 8 and
//                 32 and treats anything else as 16 -- presumably callers
//                 only pass 4, 8, 16 or 32; confirm against the callers.
//   height        number of rows. Both loops are do/while, so at least one
//                 iteration always runs. NOTE(review): the summing loop reads
//                 4 rows per iteration for width 4 and 2 rows for width 8, so
//                 height is presumably a multiple of 4 / 2 respectively.
//   round_offset  value added to the sum before the shift.
//   num_pel_log2  right-shift applied to (sum + round_offset); presumably
//                 log2(width * height) -- confirm against the callers.
//
// All loads and stores are the unaligned or 64-bit variants, so the
// intrinsics themselves impose no 16-byte alignment requirement.
static inline void subtract_average_sse2(const uint16_t *src_ptr,
23
                                         int16_t *dst_ptr, int width,
24
                                         int height, int round_offset,
25
1.00M
                                         int num_pel_log2) {
26
1.00M
  const __m128i zeros = _mm_setzero_si128();
27
1.00M
  const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
28
1.00M
  const __m128i *src = (__m128i *)src_ptr;
29
1.00M
  // One past the last row of the block, in units of 128-bit vectors.
  const __m128i *const end = src + height * CFL_BUF_LINE_I128;
30
1.00M
  // Rows consumed per iteration of the summing loop: 4 when width == 4,
  // 2 when width == 8, otherwise 1.
  const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
31
32
1.00M
  // Pass 1: accumulate the sum of all samples in four 32-bit lanes.
  __m128i sum = zeros;
33
3.54M
  do {
34
3.54M
    __m128i l0;
35
3.54M
    if (width == 4) {
36
852k
      // Four rows of four samples each: add rows pairwise in 16 bits
      // (assumes two samples never sum past 65535 -- TODO confirm the input
      // range), then zero-extend the low four lanes to 32 bits.
      l0 = _mm_add_epi16(_mm_loadl_epi64(src),
37
852k
                         _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
38
852k
      __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
39
852k
                                 _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
40
852k
      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
41
852k
                                             _mm_unpacklo_epi16(l1, zeros)));
42
2.69M
    } else {
43
2.69M
      if (width == 8) {
44
2.69M
        // Two rows of eight samples, added together in 16 bits.
        l0 = _mm_add_epi16(_mm_loadu_si128(src),
45
2.69M
                           _mm_loadu_si128(src + CFL_BUF_LINE_I128));
46
2.69M
      } else {
47
0
        // Wider rows: add the first 8 samples of the row to the next 8.
        l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));
48
0
      }
49
2.69M
      // Zero-extend both halves of the 16-bit pair sums to 32 bits and
      // accumulate.
      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
50
2.69M
                                             _mm_unpackhi_epi16(l0, zeros)));
51
2.69M
      if (width == 32) {
52
0
        // Samples 16..31 of the same row.
        l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));
53
0
        sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
54
0
                                               _mm_unpackhi_epi16(l0, zeros)));
55
0
      }
56
2.69M
    }
57
3.54M
    src += step;
58
3.54M
  } while (src < end);
59
60
1.00M
  // Reduce the four partial sums so every 32-bit lane holds the block total.
  sum = fill_sum_epi32(sum);
61
62
1.00M
  // average = (sum + round_offset) >> num_pel_log2, using a logical shift.
  __m128i avg_epi16 =
63
1.00M
      _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
64
1.00M
  // Narrow to 16 bits so all eight 16-bit lanes hold the average. The pack
  // uses signed saturation, so an average above 32767 would be clamped.
  avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
65
66
1.00M
  // Pass 2: rewind to the first row and subtract the average from every
  // sample, one row per iteration.
  src = (__m128i *)src_ptr;
67
1.00M
  __m128i *dst = (__m128i *)dst_ptr;
68
8.79M
  do {
69
8.79M
    if (width == 4) {
70
3.41M
      // Only the low 64 bits (four samples) are read and written.
      _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
71
5.38M
    } else {
72
5.38M
      _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
73
5.38M
      if (width > 8) {
74
0
        // Samples 8..15 of the row.
        _mm_storeu_si128(dst + 1,
75
0
                         _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
76
0
        if (width == 32) {
77
0
          // Samples 16..31 of the row.
          _mm_storeu_si128(dst + 2,
78
0
                           _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
79
0
          _mm_storeu_si128(dst + 3,
80
0
                           _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
81
0
        }
82
0
      }
83
5.38M
    }
84
8.79M
    // Advance source and destination by one row.
    src += CFL_BUF_LINE_I128;
85
8.79M
    dst += CFL_BUF_LINE_I128;
86
8.79M
  } while (src < end);
87
1.00M
}
88
89
// Invokes CFL_SUB_AVG_FN with the "sse2" suffix. The macro is defined outside
// this file (presumably in av1/common/cfl.h) and presumably generates the
// per-block-size entry points that call subtract_average_sse2 -- confirm
// against the macro definition.
CFL_SUB_AVG_FN(sse2)