Coverage Report

Created: 2026-03-15 06:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libvpx/vpx_dsp/x86/post_proc_sse2.c
Line
Count
Source
1
/*
2
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include <assert.h>
12
#include <emmintrin.h>
13
14
#include <stdio.h>
15
16
#include "./vpx_dsp_rtcd.h"
17
#include "vpx/vpx_integer.h"
18
#include "vpx_dsp/x86/mem_sse2.h"
19
20
extern const int16_t vpx_rv[];
21
22
// Filters the image at dst in place, working down one strip of 8 columns at a
// time. For each row a running sum and sum of squares over a 15-row vertical
// window (7 rows above, the row itself, 7 rows below) is maintained per
// column. Where 15 * sumsq - sum * sum < flimit the pixel is replaced by
// (vpx_rv entry + sum + pixel) >> 4; elsewhere the pixel is left unchanged.
//
// dst:    first pixel of the area; read and overwritten.
// pitch:  offset, in bytes, from one row to the next.
// rows:   row count; asserted to be >= 8.
// cols:   column count; asserted to be a multiple of 8.
// flimit: threshold compared against 15 * sumsq - sum * sum.
//
// NOTE(review): the main loop runs rows + 8 iterations, so 8 bytes per strip
// are also loaded from and stored to rows [rows, rows + 7]. Presumably the
// caller guarantees that border is addressable - confirm against callers.
void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
23
43.1k
                               int cols, int flimit) {
24
43.1k
  int col;
25
43.1k
  const __m128i zero = _mm_setzero_si128();
26
43.1k
  // flimit replicated into every 32-bit lane for the threshold test below.
  const __m128i f = _mm_set1_epi32(flimit);
27
43.1k
  // Ring buffer of 8 rows x 8 columns of 16-bit pixels: the unfiltered values
  // of the 8 rows preceding the current one, indexed by (row & 7).
  DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);
28
29
  // 8 columns are processed at a time.
30
  // If rows is less than 8 the bottom border extension fails.
31
43.1k
  assert(cols % 8 == 0);
32
43.1k
  assert(rows >= 8);
33
34
1.58M
  for (col = 0; col < cols; col += 8) {
35
1.54M
    int row, i;
36
1.54M
    // Load the 8 pixels of row 0 of this strip.
    __m128i s = _mm_loadl_epi64((__m128i *)dst);
37
1.54M
    __m128i sum, sumsq_0, sumsq_1;
38
1.54M
    __m128i tmp_0, tmp_1;
39
1.54M
    __m128i below_context = _mm_setzero_si128();
40
41
1.54M
    // Widen the 8 pixels from 8 to 16 bits.
    s = _mm_unpacklo_epi8(s, zero);
42
43
13.8M
    // Top border: every slot of the above-row ring buffer starts as a copy of
    // row 0.
    for (i = 0; i < 8; ++i) {
44
12.3M
      _mm_store_si128((__m128i *)above_context + i, s);
45
12.3M
    }
46
47
    // sum = s * 9: row 0 itself plus the 8 copies of it stored above.
48
1.54M
    sum = _mm_slli_epi16(s, 3);
49
1.54M
    sum = _mm_add_epi16(s, sum);
50
51
    // s^2 * 9 == (s * 9) * s. mullo/mulhi give the low and high 16 bits of
    // each product; the unpacks below interleave them into 32-bit lanes.
52
1.54M
    tmp_0 = _mm_mullo_epi16(sum, s);
53
1.54M
    tmp_1 = _mm_mulhi_epi16(sum, s);
54
55
1.54M
    // sumsq_0 holds columns 0-3, sumsq_1 holds columns 4-7.
    sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
56
1.54M
    sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);
57
58
    // Prime sum/sumsq with rows 1 through 6.
59
10.8M
    for (i = 1; i <= 6; ++i) {
60
9.26M
      __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
61
9.26M
      a = _mm_unpacklo_epi8(a, zero);
62
9.26M
      sum = _mm_add_epi16(sum, a);
63
9.26M
      a = _mm_mullo_epi16(a, a);
64
9.26M
      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
65
9.26M
      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
66
9.26M
    }
67
68
749M
    // Each iteration slides the window down one row, then filters that row.
    for (row = 0; row < rows + 8; row++) {
69
748M
      // Unfiltered pixels of the row 8 above the current one (or the row 0
      // copy while row < 8).
      const __m128i above =
70
748M
          _mm_load_si128((__m128i *)above_context + (row & 7));
71
748M
      __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
72
748M
      __m128i above_sq, below_sq;
73
748M
      __m128i mask_0, mask_1;
74
748M
      __m128i multmp_0, multmp_1;
75
748M
      __m128i rv;
76
748M
      __m128i out;
77
78
748M
      this_row = _mm_unpacklo_epi8(this_row, zero);
79
80
748M
      if (row + 7 < rows) {
81
        // Instead of copying the end context we just stop loading when we get
82
        // to the last one.
83
724M
        below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
84
724M
        below_context = _mm_unpacklo_epi8(below_context, zero);
85
724M
      }
86
87
748M
      // Slide the window: drop the row leaving at the top, add the row 7
      // below the current one.
      sum = _mm_sub_epi16(sum, above);
88
748M
      sum = _mm_add_epi16(sum, below_context);
89
90
      // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
91
      // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
92
      // because x86 does not have unpack with sign extension.
93
748M
      above_sq = _mm_mullo_epi16(above, above);
94
748M
      sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
95
748M
      sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));
96
97
748M
      below_sq = _mm_mullo_epi16(below_context, below_context);
98
748M
      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
99
748M
      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));
100
101
      // sumsq * 16 - sumsq == sumsq * 15
102
748M
      mask_0 = _mm_slli_epi32(sumsq_0, 4);
103
748M
      mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
104
748M
      mask_1 = _mm_slli_epi32(sumsq_1, 4);
105
748M
      mask_1 = _mm_sub_epi32(mask_1, sumsq_1);
106
107
748M
      // sum * sum as low/high 16-bit halves, recombined into 32-bit lanes by
      // the unpacks below.
      multmp_0 = _mm_mullo_epi16(sum, sum);
108
748M
      multmp_1 = _mm_mulhi_epi16(sum, sum);
109
110
748M
      // mask = sumsq * 15 - sum * sum
      mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
111
748M
      mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));
112
113
      // mask - f gives a negative value when mask < f
114
748M
      mask_0 = _mm_sub_epi32(mask_0, f);
115
748M
      mask_1 = _mm_sub_epi32(mask_1, f);
116
117
      // Shift the sign bit down to create a mask
118
748M
      mask_0 = _mm_srai_epi32(mask_0, 31);
119
748M
      mask_1 = _mm_srai_epi32(mask_1, 31);
120
121
748M
      // Narrow the two 32-bit lane masks into one vector of 16-bit masks; the
      // signed saturating pack keeps 0 and -1 intact.
      mask_0 = _mm_packs_epi32(mask_0, mask_1);
122
123
748M
      // 8 consecutive int16_t entries of vpx_rv starting at index (row & 127);
      // unaligned load.
      rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));
124
125
748M
      // Filtered value: (rv + sum + this_row) >> 4.
      mask_1 = _mm_add_epi16(rv, sum);
126
748M
      mask_1 = _mm_add_epi16(mask_1, this_row);
127
748M
      mask_1 = _mm_srai_epi16(mask_1, 4);
128
129
748M
      // Take the filtered value where the mask is set and the original pixel
      // everywhere else.
      mask_1 = _mm_and_si128(mask_0, mask_1);
130
748M
      mask_0 = _mm_andnot_si128(mask_0, this_row);
131
748M
      out = _mm_or_si128(mask_1, mask_0);
132
133
748M
      // Pack back to 8 bits with unsigned saturation and overwrite the row.
      _mm_storel_epi64((__m128i *)(dst + row * pitch),
134
748M
                       _mm_packus_epi16(out, zero));
135
136
748M
      // Keep the unfiltered row; this slot is read back as 'above' 8
      // iterations from now.
      _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
137
748M
    }
138
139
1.54M
    // Advance to the next strip of 8 columns.
    dst += 8;
140
1.54M
  }
141
43.1k
}