Coverage Report

Created: 2026-06-15 06:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/libde265/libde265/x86/sse-deblk.cc
Line
Count
Source
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2026 Dirk Farin <dirk.farin@gmail.com>
4
 *
5
 * This file is part of libde265.
6
 *
7
 * libde265 is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as
9
 * published by the Free Software Foundation, either version 3 of
10
 * the License, or (at your option) any later version.
11
 *
12
 * libde265 is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
// SSE4.1 8-bit deblocking. One edge segment = 4 lines along the edge. The four
22
// lines all use the same per-edge parameters and (for luma) the same strong/
23
// weak choice, so they are processed in parallel as the 4 int32 lanes of an
24
// xmm register. Each sample position (p3..q3) becomes one vector-of-4-lines.
25
// For horizontal edges those vectors are 4 contiguous bytes (one per line);
26
// for vertical edges the 4 lines are strided, so load/store transpose a small
27
// block. The arithmetic is identical to the scalar kernels -> bit-exact.
28
29
#include "x86/sse-deblk.h"
30
31
#ifdef HAVE_CONFIG_H
32
#include "config.h"
33
#endif
34
35
#if HAVE_SSE4_1
36
37
#include <string.h>
38
#include <smmintrin.h> // SSE4.1
39
40
namespace {
41
42
0
inline __m128i load4(const uint8_t* p) {           // 4 bytes -> 4 int32
43
0
  int32_t t; memcpy(&t, p, 4);
44
0
  return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(t));
45
0
}
46
0
inline void store4(uint8_t* p, __m128i v) {        // 4 int32 (0..255) -> 4 bytes
47
0
  __m128i b = _mm_packus_epi16(_mm_packus_epi32(v, v), _mm_setzero_si128());
48
0
  int32_t t = _mm_cvtsi128_si32(b);
49
0
  memcpy(p, &t, 4);
50
0
}
51
52
0
inline __m128i clip3(__m128i lo, __m128i hi, __m128i v) {
53
0
  return _mm_min_epi32(_mm_max_epi32(v, lo), hi);
54
0
}
55
0
inline __m128i clip_u8(__m128i v) {
56
0
  return _mm_min_epi32(_mm_max_epi32(v, _mm_setzero_si128()), _mm_set1_epi32(255));
57
0
}
58
0
inline __m128i x2(__m128i a){ return _mm_add_epi32(a,a); }
59
0
inline __m128i add3(__m128i a,__m128i b,__m128i c){ return _mm_add_epi32(_mm_add_epi32(a,b),c); }
60
61
// --- vertical: load/store transpose a 4(lines) x 8(samples) block ----------
62
63
0
inline void load_vert(const uint8_t* ptr, ptrdiff_t stride, __m128i s[8]) {
64
0
  __m128i r0 = _mm_loadl_epi64((const __m128i*)(ptr-4));
65
0
  __m128i r1 = _mm_loadl_epi64((const __m128i*)(ptr-4+stride));
66
0
  __m128i r2 = _mm_loadl_epi64((const __m128i*)(ptr-4+2*stride));
67
0
  __m128i r3 = _mm_loadl_epi64((const __m128i*)(ptr-4+3*stride));
68
0
  __m128i e  = _mm_unpacklo_epi8(r0, r1);
69
0
  __m128i f  = _mm_unpacklo_epi8(r2, r3);
70
0
  __m128i lo = _mm_unpacklo_epi16(e, f);   // samples p3 p2 p1 p0 (4 bytes each, 4 lines)
71
0
  __m128i hi = _mm_unpackhi_epi16(e, f);   // samples q0 q1 q2 q3
72
0
  s[0]=_mm_cvtepu8_epi32(lo);
73
0
  s[1]=_mm_cvtepu8_epi32(_mm_srli_si128(lo,4));
74
0
  s[2]=_mm_cvtepu8_epi32(_mm_srli_si128(lo,8));
75
0
  s[3]=_mm_cvtepu8_epi32(_mm_srli_si128(lo,12));
76
0
  s[4]=_mm_cvtepu8_epi32(hi);
77
0
  s[5]=_mm_cvtepu8_epi32(_mm_srli_si128(hi,4));
78
0
  s[6]=_mm_cvtepu8_epi32(_mm_srli_si128(hi,8));
79
0
  s[7]=_mm_cvtepu8_epi32(_mm_srli_si128(hi,12));
80
0
}
81
82
0
inline void store_vert(uint8_t* ptr, ptrdiff_t stride, const __m128i s[8]) {
83
0
  __m128i lo = _mm_packus_epi16(_mm_packus_epi32(s[0],s[1]), _mm_packus_epi32(s[2],s[3]));
84
0
  __m128i hi = _mm_packus_epi16(_mm_packus_epi32(s[4],s[5]), _mm_packus_epi32(s[6],s[7]));
85
0
  const __m128i base = _mm_setr_epi8(0,4,8,12, (char)0x80,(char)0x80,(char)0x80,(char)0x80,
86
0
                                     (char)0x80,(char)0x80,(char)0x80,(char)0x80,
87
0
                                     (char)0x80,(char)0x80,(char)0x80,(char)0x80);
88
0
  for (int k=0;k<4;k++) {
89
0
    __m128i mask = _mm_add_epi8(base, _mm_set1_epi8((char)k));  // {k,4+k,8+k,12+k, 0x80..}
90
0
    __m128i a = _mm_shuffle_epi8(lo, mask);   // low4 = s0..s3 of line k
91
0
    __m128i b = _mm_shuffle_epi8(hi, mask);   // low4 = s4..s7 of line k
92
0
    __m128i row = _mm_unpacklo_epi32(a, b);   // low8 = the 8 samples of line k
93
0
    _mm_storel_epi64((__m128i*)(ptr-4+k*stride), row);
94
0
  }
95
0
}
96
97
0
inline void load_horiz(const uint8_t* ptr, ptrdiff_t stride, __m128i s[8]) {
98
0
  s[0]=load4(ptr-4*stride); s[1]=load4(ptr-3*stride); s[2]=load4(ptr-2*stride); s[3]=load4(ptr-1*stride);
99
0
  s[4]=load4(ptr+0*stride); s[5]=load4(ptr+1*stride); s[6]=load4(ptr+2*stride); s[7]=load4(ptr+3*stride);
100
0
}
101
0
inline void store_horiz(uint8_t* ptr, ptrdiff_t stride, const __m128i s[8]) {
102
0
  store4(ptr-4*stride,s[0]); store4(ptr-3*stride,s[1]); store4(ptr-2*stride,s[2]); store4(ptr-1*stride,s[3]);
103
0
  store4(ptr+0*stride,s[4]); store4(ptr+1*stride,s[5]); store4(ptr+2*stride,s[6]); store4(ptr+3*stride,s[7]);
104
0
}
105
106
} // namespace
107
108
109
void deblock_luma_8_sse4(uint8_t* ptr, ptrdiff_t stride, int vertical,
110
                         int dE, int dEp, int dEq, int tc, int filterP, int filterQ)
111
0
{
112
0
  __m128i s[8];
113
0
  if (vertical) load_vert(ptr, stride, s); else load_horiz(ptr, stride, s);
114
115
0
  const __m128i p3=s[0], p2=s[1], p1=s[2], p0=s[3];
116
0
  const __m128i q0=s[4], q1=s[5], q2=s[6], q3=s[7];
117
118
0
  if (dE==2) {
119
    // strong filtering
120
0
    const __m128i v2tc = _mm_set1_epi32(2*tc);
121
0
    const __m128i c4   = _mm_set1_epi32(4);
122
0
    const __m128i c2   = _mm_set1_epi32(2);
123
124
0
    __m128i pn0 = _mm_srai_epi32(_mm_add_epi32(add3(p2, x2(p1), x2(p0)), add3(x2(q0), q1, c4)), 3);
125
0
    pn0 = clip3(_mm_sub_epi32(p0,v2tc), _mm_add_epi32(p0,v2tc), pn0);
126
0
    __m128i pn1 = _mm_srai_epi32(_mm_add_epi32(add3(p2,p1,p0), _mm_add_epi32(q0,c2)), 2);
127
0
    pn1 = clip3(_mm_sub_epi32(p1,v2tc), _mm_add_epi32(p1,v2tc), pn1);
128
0
    __m128i pn2 = _mm_srai_epi32(_mm_add_epi32(add3(x2(p3), _mm_add_epi32(x2(p2),p2), p1), add3(p0,q0,c4)), 3);
129
0
    pn2 = clip3(_mm_sub_epi32(p2,v2tc), _mm_add_epi32(p2,v2tc), pn2);
130
131
0
    __m128i qn0 = _mm_srai_epi32(_mm_add_epi32(add3(p1, x2(p0), x2(q0)), add3(x2(q1), q2, c4)), 3);
132
0
    qn0 = clip3(_mm_sub_epi32(q0,v2tc), _mm_add_epi32(q0,v2tc), qn0);
133
0
    __m128i qn1 = _mm_srai_epi32(_mm_add_epi32(add3(p0,q0,q1), _mm_add_epi32(q2,c2)), 2);
134
0
    qn1 = clip3(_mm_sub_epi32(q1,v2tc), _mm_add_epi32(q1,v2tc), qn1);
135
0
    __m128i qn2 = _mm_srai_epi32(_mm_add_epi32(add3(p0,q0,q1), add3(_mm_add_epi32(x2(q2),q2), x2(q3), c4)), 3);
136
0
    qn2 = clip3(_mm_sub_epi32(q2,v2tc), _mm_add_epi32(q2,v2tc), qn2);
137
138
0
    if (filterP) { s[3]=pn0; s[2]=pn1; s[1]=pn2; }
139
0
    if (filterQ) { s[4]=qn0; s[5]=qn1; s[6]=qn2; }
140
0
  }
141
0
  else {
142
    // weak filtering
143
0
    const __m128i vtc  = _mm_set1_epi32(tc);
144
0
    const __m128i delta0 = _mm_srai_epi32(
145
0
        _mm_add_epi32(_mm_sub_epi32(_mm_mullo_epi32(_mm_set1_epi32(9), _mm_sub_epi32(q0,p0)),
146
0
                                    _mm_mullo_epi32(_mm_set1_epi32(3), _mm_sub_epi32(q1,p1))),
147
0
                      _mm_set1_epi32(8)), 4);
148
    // per-line mask: abs(delta) < tc*10
149
0
    __m128i mask = _mm_cmpgt_epi32(_mm_set1_epi32(tc*10), _mm_abs_epi32(delta0));
150
0
    __m128i delta = clip3(_mm_set1_epi32(-tc), vtc, delta0);
151
152
0
    if (filterP) {
153
0
      __m128i p0n = clip_u8(_mm_add_epi32(p0, delta));
154
0
      s[3] = _mm_blendv_epi8(p0, p0n, mask);
155
0
    }
156
0
    if (filterQ) {
157
0
      __m128i q0n = clip_u8(_mm_sub_epi32(q0, delta));
158
0
      s[4] = _mm_blendv_epi8(q0, q0n, mask);
159
0
    }
160
0
    if (dEp && filterP) {
161
0
      const __m128i htc = _mm_set1_epi32(tc>>1);
162
0
      __m128i dp = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p2,p0),_mm_set1_epi32(1)),1), p1), delta), 1);
163
0
      dp = clip3(_mm_sub_epi32(_mm_setzero_si128(),htc), htc, dp);
164
0
      __m128i p1n = clip_u8(_mm_add_epi32(p1, dp));
165
0
      s[2] = _mm_blendv_epi8(p1, p1n, mask);
166
0
    }
167
0
    if (dEq && filterQ) {
168
0
      const __m128i htc = _mm_set1_epi32(tc>>1);
169
0
      __m128i dq = _mm_srai_epi32(_mm_sub_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(q2,q0),_mm_set1_epi32(1)),1), q1), delta), 1);
170
0
      dq = clip3(_mm_sub_epi32(_mm_setzero_si128(),htc), htc, dq);
171
0
      __m128i q1n = clip_u8(_mm_add_epi32(q1, dq));
172
0
      s[5] = _mm_blendv_epi8(q1, q1n, mask);
173
0
    }
174
0
  }
175
176
0
  if (vertical) store_vert(ptr, stride, s); else store_horiz(ptr, stride, s);
177
0
}
178
179
// Note: an SSE chroma deblock filter was implemented and benchmarked too, but
180
// the chroma filter is a single delta per line -- so trivial that it is fully
181
// load/store-bound (the vertical case needs a strided 2-column scatter), and
182
// SSE measured slower than scalar (~0.5-0.9x). Chroma deblock therefore stays
183
// on the scalar fallback; only luma is accelerated here.
184
185
#endif // HAVE_SSE4_1