Coverage Report

Created: 2023-06-07 06:31

/src/aom/av1/common/x86/cfl_sse2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <emmintrin.h>
13
14
#include "av1/common/cfl.h"
15
#include "config/av1_rtcd.h"
16
17
877k
// Returns a vector in which every 32-bit lane holds the sum of the four
// 32-bit lanes of l0 (a horizontal add that leaves the total in all lanes).
static INLINE __m128i fill_sum_epi32(__m128i l0) {
18
877k
  // Swap the two 64-bit halves and add: lanes become {a0+a2, a1+a3, a0+a2, a1+a3}.
  l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
19
877k
  // Swap adjacent 32-bit lanes and add: every lane now holds a0+a1+a2+a3.
  return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
20
877k
}
21
22
// Subtracts the rounded average of a width x height block of 16-bit samples
// from every sample and writes the 16-bit differences to dst_ptr.
//
// src_ptr, dst_ptr: both are walked with a row stride of CFL_BUF_LINE_I128
//   __m128i units (each unit is eight 16-bit values); all loads and stores
//   use the unaligned or 64-bit forms.
// width: 4 and 8 have dedicated paths, 32 adds a second pair of vectors, and
//   any other value takes the two-vector (16 sample) path.
//   NOTE(review): presumably only 4/8/16/32 are ever passed - confirm callers.
// height: number of rows. Both loops are do/while, so one pass always runs.
// round_offset, num_pel_log2: average = (sum + round_offset) >> num_pel_log2.
//   NOTE(review): presumably num_pel_log2 == log2(width * height) and
//   round_offset is half the sample count - confirm with callers.
static INLINE void subtract_average_sse2(const uint16_t *src_ptr,
23
                                         int16_t *dst_ptr, int width,
24
                                         int height, int round_offset,
25
877k
                                         int num_pel_log2) {
26
877k
  const __m128i zeros = _mm_setzero_si128();
27
877k
  const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
28
877k
  const __m128i *src = (__m128i *)src_ptr;
29
877k
  // One past the last row, measured in __m128i units from the start of src.
  const __m128i *const end = src + height * CFL_BUF_LINE_I128;
30
877k
  // Rows consumed per summing iteration: 4 for width 4, 2 for width 8, else 1.
  // NOTE(review): height is presumably a multiple of that row count; otherwise
  // the last iteration would read rows past `end` - confirm with callers.
  const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
31
32
877k
  // Pass 1: accumulate the sample total in four 32-bit partial sums.
  __m128i sum = zeros;
33
3.06M
  do {
34
3.06M
    __m128i l0;
35
3.06M
    if (width == 4) {
36
836k
      // Four rows of four samples: add rows pairwise in 16 bits, then
      // zero-extend the low four lanes of each pair sum to 32 bits.
      // NOTE(review): the 16-bit adds assume two samples cannot exceed 16
      // bits when summed - confirm the sample range.
      l0 = _mm_add_epi16(_mm_loadl_epi64(src),
37
836k
                         _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
38
836k
      __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
39
836k
                                 _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
40
836k
      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
41
836k
                                             _mm_unpacklo_epi16(l1, zeros)));
42
2.23M
    } else {
43
2.23M
      if (width == 8) {
44
2.23M
        // Two rows of eight samples, added lane-wise in 16 bits.
        l0 = _mm_add_epi16(_mm_loadu_si128(src),
45
2.23M
                           _mm_loadu_si128(src + CFL_BUF_LINE_I128));
46
2.23M
      } else {
47
2
        // One row: add the first two vectors (samples 0-7 and 8-15).
        l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));
48
2
      }
49
2.23M
      // Zero-extend both halves of the 16-bit sums to 32 bits and accumulate.
      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
50
2.23M
                                             _mm_unpackhi_epi16(l0, zeros)));
51
2.23M
      if (width == 32) {
52
0
        // Same again for the remaining two vectors (samples 16-31) of the row.
        l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));
53
0
        sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
54
0
                                               _mm_unpackhi_epi16(l0, zeros)));
55
0
      }
56
2.23M
    }
57
3.06M
    src += step;
58
3.06M
  } while (src < end);
59
60
877k
  // Collapse the four partial sums so every lane holds the block total.
  sum = fill_sum_epi32(sum);
61
62
877k
  // Rounded average in each 32-bit lane (logical right shift).
  __m128i avg_epi16 =
63
877k
      _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
64
877k
  // Narrow to 16 bits with signed saturation: eight copies of the average.
  avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
65
66
877k
  // Pass 2: rewind to the first row and subtract the average row by row.
  src = (__m128i *)src_ptr;
67
877k
  __m128i *dst = (__m128i *)dst_ptr;
68
7.80M
  do {
69
7.80M
    if (width == 4) {
70
3.34M
      // Only the low 64 bits (four samples) are loaded and stored.
      _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
71
4.46M
    } else {
72
4.46M
      _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
73
4.46M
      if (width > 8) {
74
0
        // Samples 8-15 of the row.
        _mm_storeu_si128(dst + 1,
75
0
                         _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
76
0
        if (width == 32) {
77
0
          // Samples 16-31 of the row.
          _mm_storeu_si128(dst + 2,
78
0
                           _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
79
0
          _mm_storeu_si128(dst + 3,
80
0
                           _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
81
0
        }
82
0
      }
83
4.46M
    }
84
7.80M
    // Source and destination advance by the same one-row stride.
    src += CFL_BUF_LINE_I128;
85
7.80M
    dst += CFL_BUF_LINE_I128;
86
7.80M
  } while (src < end);
87
877k
}
88
89
// Invokes CFL_SUB_AVG_FN for the "sse2" suffix. The macro is not defined in
// this file. NOTE(review): presumably it comes from av1/common/cfl.h and
// generates the per-block-size entry points that call subtract_average_sse2
// - confirm against the macro definition.
CFL_SUB_AVG_FN(sse2)