/work/libde265/libde265/x86/sse-deblk.cc
Line | Count | Source |
1 | | /* |
2 | | * H.265 video codec. |
3 | | * Copyright (c) 2026 Dirk Farin <dirk.farin@gmail.com> |
4 | | * |
5 | | * This file is part of libde265. |
6 | | * |
7 | | * libde265 is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as |
9 | | * published by the Free Software Foundation, either version 3 of |
10 | | * the License, or (at your option) any later version. |
11 | | * |
12 | | * libde265 is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libde265. If not, see <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | // SSE4.1 8-bit deblocking. One edge segment = 4 lines along the edge. The four |
22 | | // lines all use the same per-edge parameters and (for luma) the same strong/ |
23 | | // weak choice, so they are processed in parallel as the 4 int32 lanes of an |
24 | | // xmm register. Each sample position (p3..q3) becomes one vector-of-4-lines. |
25 | | // For horizontal edges those vectors are 4 contiguous bytes (one per line); |
26 | | // for vertical edges the 4 lines are strided, so load/store transpose a small |
27 | | // block. The arithmetic is identical to the scalar kernels -> bit-exact. |
28 | | |
29 | | #include "x86/sse-deblk.h" |
30 | | |
31 | | #ifdef HAVE_CONFIG_H |
32 | | #include "config.h" |
33 | | #endif |
34 | | |
35 | | #if HAVE_SSE4_1 |
36 | | |
37 | | #include <string.h> |
38 | | #include <smmintrin.h> // SSE4.1 |
39 | | |
40 | | namespace { |
41 | | |
42 | 0 | inline __m128i load4(const uint8_t* p) { // 4 bytes -> 4 int32 |
43 | 0 | int32_t t; memcpy(&t, p, 4); |
44 | 0 | return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(t)); |
45 | 0 | } |
46 | 0 | inline void store4(uint8_t* p, __m128i v) { // 4 int32 (0..255) -> 4 bytes |
47 | 0 | __m128i b = _mm_packus_epi16(_mm_packus_epi32(v, v), _mm_setzero_si128()); |
48 | 0 | int32_t t = _mm_cvtsi128_si32(b); |
49 | 0 | memcpy(p, &t, 4); |
50 | 0 | } |
51 | | |
52 | 0 | inline __m128i clip3(__m128i lo, __m128i hi, __m128i v) { |
53 | 0 | return _mm_min_epi32(_mm_max_epi32(v, lo), hi); |
54 | 0 | } |
55 | 0 | inline __m128i clip_u8(__m128i v) { |
56 | 0 | return _mm_min_epi32(_mm_max_epi32(v, _mm_setzero_si128()), _mm_set1_epi32(255)); |
57 | 0 | } |
58 | 0 | inline __m128i x2(__m128i a){ return _mm_add_epi32(a,a); } |
59 | 0 | inline __m128i add3(__m128i a,__m128i b,__m128i c){ return _mm_add_epi32(_mm_add_epi32(a,b),c); } |
60 | | |
61 | | // --- vertical: load/store transpose a 4(lines) x 8(samples) block ---------- |
62 | | |
63 | 0 | inline void load_vert(const uint8_t* ptr, ptrdiff_t stride, __m128i s[8]) { |
64 | 0 | __m128i r0 = _mm_loadl_epi64((const __m128i*)(ptr-4)); |
65 | 0 | __m128i r1 = _mm_loadl_epi64((const __m128i*)(ptr-4+stride)); |
66 | 0 | __m128i r2 = _mm_loadl_epi64((const __m128i*)(ptr-4+2*stride)); |
67 | 0 | __m128i r3 = _mm_loadl_epi64((const __m128i*)(ptr-4+3*stride)); |
68 | 0 | __m128i e = _mm_unpacklo_epi8(r0, r1); |
69 | 0 | __m128i f = _mm_unpacklo_epi8(r2, r3); |
70 | 0 | __m128i lo = _mm_unpacklo_epi16(e, f); // samples p3 p2 p1 p0 (4 bytes each, 4 lines) |
71 | 0 | __m128i hi = _mm_unpackhi_epi16(e, f); // samples q0 q1 q2 q3 |
72 | 0 | s[0]=_mm_cvtepu8_epi32(lo); |
73 | 0 | s[1]=_mm_cvtepu8_epi32(_mm_srli_si128(lo,4)); |
74 | 0 | s[2]=_mm_cvtepu8_epi32(_mm_srli_si128(lo,8)); |
75 | 0 | s[3]=_mm_cvtepu8_epi32(_mm_srli_si128(lo,12)); |
76 | 0 | s[4]=_mm_cvtepu8_epi32(hi); |
77 | 0 | s[5]=_mm_cvtepu8_epi32(_mm_srli_si128(hi,4)); |
78 | 0 | s[6]=_mm_cvtepu8_epi32(_mm_srli_si128(hi,8)); |
79 | 0 | s[7]=_mm_cvtepu8_epi32(_mm_srli_si128(hi,12)); |
80 | 0 | } |
81 | | |
82 | 0 | inline void store_vert(uint8_t* ptr, ptrdiff_t stride, const __m128i s[8]) { |
83 | 0 | __m128i lo = _mm_packus_epi16(_mm_packus_epi32(s[0],s[1]), _mm_packus_epi32(s[2],s[3])); |
84 | 0 | __m128i hi = _mm_packus_epi16(_mm_packus_epi32(s[4],s[5]), _mm_packus_epi32(s[6],s[7])); |
85 | 0 | const __m128i base = _mm_setr_epi8(0,4,8,12, (char)0x80,(char)0x80,(char)0x80,(char)0x80, |
86 | 0 | (char)0x80,(char)0x80,(char)0x80,(char)0x80, |
87 | 0 | (char)0x80,(char)0x80,(char)0x80,(char)0x80); |
88 | 0 | for (int k=0;k<4;k++) { |
89 | 0 | __m128i mask = _mm_add_epi8(base, _mm_set1_epi8((char)k)); // {k,4+k,8+k,12+k, 0x80..} |
90 | 0 | __m128i a = _mm_shuffle_epi8(lo, mask); // low4 = s0..s3 of line k |
91 | 0 | __m128i b = _mm_shuffle_epi8(hi, mask); // low4 = s4..s7 of line k |
92 | 0 | __m128i row = _mm_unpacklo_epi32(a, b); // low8 = the 8 samples of line k |
93 | 0 | _mm_storel_epi64((__m128i*)(ptr-4+k*stride), row); |
94 | 0 | } |
95 | 0 | } |
96 | | |
97 | 0 | inline void load_horiz(const uint8_t* ptr, ptrdiff_t stride, __m128i s[8]) { |
98 | 0 | s[0]=load4(ptr-4*stride); s[1]=load4(ptr-3*stride); s[2]=load4(ptr-2*stride); s[3]=load4(ptr-1*stride); |
99 | 0 | s[4]=load4(ptr+0*stride); s[5]=load4(ptr+1*stride); s[6]=load4(ptr+2*stride); s[7]=load4(ptr+3*stride); |
100 | 0 | } |
101 | 0 | inline void store_horiz(uint8_t* ptr, ptrdiff_t stride, const __m128i s[8]) { |
102 | 0 | store4(ptr-4*stride,s[0]); store4(ptr-3*stride,s[1]); store4(ptr-2*stride,s[2]); store4(ptr-1*stride,s[3]); |
103 | 0 | store4(ptr+0*stride,s[4]); store4(ptr+1*stride,s[5]); store4(ptr+2*stride,s[6]); store4(ptr+3*stride,s[7]); |
104 | 0 | } |
105 | | |
106 | | } // namespace |
107 | | |
108 | | |
109 | | void deblock_luma_8_sse4(uint8_t* ptr, ptrdiff_t stride, int vertical, |
110 | | int dE, int dEp, int dEq, int tc, int filterP, int filterQ) |
111 | 0 | { |
112 | 0 | __m128i s[8]; |
113 | 0 | if (vertical) load_vert(ptr, stride, s); else load_horiz(ptr, stride, s); |
114 | |
|
115 | 0 | const __m128i p3=s[0], p2=s[1], p1=s[2], p0=s[3]; |
116 | 0 | const __m128i q0=s[4], q1=s[5], q2=s[6], q3=s[7]; |
117 | |
|
118 | 0 | if (dE==2) { |
119 | | // strong filtering |
120 | 0 | const __m128i v2tc = _mm_set1_epi32(2*tc); |
121 | 0 | const __m128i c4 = _mm_set1_epi32(4); |
122 | 0 | const __m128i c2 = _mm_set1_epi32(2); |
123 | |
|
124 | 0 | __m128i pn0 = _mm_srai_epi32(_mm_add_epi32(add3(p2, x2(p1), x2(p0)), add3(x2(q0), q1, c4)), 3); |
125 | 0 | pn0 = clip3(_mm_sub_epi32(p0,v2tc), _mm_add_epi32(p0,v2tc), pn0); |
126 | 0 | __m128i pn1 = _mm_srai_epi32(_mm_add_epi32(add3(p2,p1,p0), _mm_add_epi32(q0,c2)), 2); |
127 | 0 | pn1 = clip3(_mm_sub_epi32(p1,v2tc), _mm_add_epi32(p1,v2tc), pn1); |
128 | 0 | __m128i pn2 = _mm_srai_epi32(_mm_add_epi32(add3(x2(p3), _mm_add_epi32(x2(p2),p2), p1), add3(p0,q0,c4)), 3); |
129 | 0 | pn2 = clip3(_mm_sub_epi32(p2,v2tc), _mm_add_epi32(p2,v2tc), pn2); |
130 | |
|
131 | 0 | __m128i qn0 = _mm_srai_epi32(_mm_add_epi32(add3(p1, x2(p0), x2(q0)), add3(x2(q1), q2, c4)), 3); |
132 | 0 | qn0 = clip3(_mm_sub_epi32(q0,v2tc), _mm_add_epi32(q0,v2tc), qn0); |
133 | 0 | __m128i qn1 = _mm_srai_epi32(_mm_add_epi32(add3(p0,q0,q1), _mm_add_epi32(q2,c2)), 2); |
134 | 0 | qn1 = clip3(_mm_sub_epi32(q1,v2tc), _mm_add_epi32(q1,v2tc), qn1); |
135 | 0 | __m128i qn2 = _mm_srai_epi32(_mm_add_epi32(add3(p0,q0,q1), add3(_mm_add_epi32(x2(q2),q2), x2(q3), c4)), 3); |
136 | 0 | qn2 = clip3(_mm_sub_epi32(q2,v2tc), _mm_add_epi32(q2,v2tc), qn2); |
137 | |
|
138 | 0 | if (filterP) { s[3]=pn0; s[2]=pn1; s[1]=pn2; } |
139 | 0 | if (filterQ) { s[4]=qn0; s[5]=qn1; s[6]=qn2; } |
140 | 0 | } |
141 | 0 | else { |
142 | | // weak filtering |
143 | 0 | const __m128i vtc = _mm_set1_epi32(tc); |
144 | 0 | const __m128i delta0 = _mm_srai_epi32( |
145 | 0 | _mm_add_epi32(_mm_sub_epi32(_mm_mullo_epi32(_mm_set1_epi32(9), _mm_sub_epi32(q0,p0)), |
146 | 0 | _mm_mullo_epi32(_mm_set1_epi32(3), _mm_sub_epi32(q1,p1))), |
147 | 0 | _mm_set1_epi32(8)), 4); |
148 | | // per-line mask: abs(delta) < tc*10 |
149 | 0 | __m128i mask = _mm_cmpgt_epi32(_mm_set1_epi32(tc*10), _mm_abs_epi32(delta0)); |
150 | 0 | __m128i delta = clip3(_mm_set1_epi32(-tc), vtc, delta0); |
151 | |
|
152 | 0 | if (filterP) { |
153 | 0 | __m128i p0n = clip_u8(_mm_add_epi32(p0, delta)); |
154 | 0 | s[3] = _mm_blendv_epi8(p0, p0n, mask); |
155 | 0 | } |
156 | 0 | if (filterQ) { |
157 | 0 | __m128i q0n = clip_u8(_mm_sub_epi32(q0, delta)); |
158 | 0 | s[4] = _mm_blendv_epi8(q0, q0n, mask); |
159 | 0 | } |
160 | 0 | if (dEp && filterP) { |
161 | 0 | const __m128i htc = _mm_set1_epi32(tc>>1); |
162 | 0 | __m128i dp = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p2,p0),_mm_set1_epi32(1)),1), p1), delta), 1); |
163 | 0 | dp = clip3(_mm_sub_epi32(_mm_setzero_si128(),htc), htc, dp); |
164 | 0 | __m128i p1n = clip_u8(_mm_add_epi32(p1, dp)); |
165 | 0 | s[2] = _mm_blendv_epi8(p1, p1n, mask); |
166 | 0 | } |
167 | 0 | if (dEq && filterQ) { |
168 | 0 | const __m128i htc = _mm_set1_epi32(tc>>1); |
169 | 0 | __m128i dq = _mm_srai_epi32(_mm_sub_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(q2,q0),_mm_set1_epi32(1)),1), q1), delta), 1); |
170 | 0 | dq = clip3(_mm_sub_epi32(_mm_setzero_si128(),htc), htc, dq); |
171 | 0 | __m128i q1n = clip_u8(_mm_add_epi32(q1, dq)); |
172 | 0 | s[5] = _mm_blendv_epi8(q1, q1n, mask); |
173 | 0 | } |
174 | 0 | } |
175 | |
|
176 | 0 | if (vertical) store_vert(ptr, stride, s); else store_horiz(ptr, stride, s); |
177 | 0 | } |
178 | | |
179 | | // Note: an SSE chroma deblock filter was implemented and benchmarked too, but |
180 | | // the chroma filter is a single delta per line -- so trivial that it is fully |
181 | | // load/store-bound (the vertical case needs a strided 2-column scatter), and |
182 | | // SSE measured slower than scalar (~0.5-0.9x). Chroma deblock therefore stays |
183 | | // on the scalar fallback; only luma is accelerated here. |
184 | | |
185 | | #endif // HAVE_SSE4_1 |