/work/libde265/libde265/x86/sse-intrapred.cc
Line | Count | Source |
1 | | /* |
2 | | * H.265 video codec. |
3 | | * Copyright (c) 2026 Dirk Farin <dirk.farin@gmail.com> |
4 | | * |
5 | | * This file is part of libde265. |
6 | | * |
7 | | * libde265 is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as |
9 | | * published by the Free Software Foundation, either version 3 of |
10 | | * the License, or (at your option) any later version. |
11 | | * |
12 | | * libde265 is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libde265. If not, see <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | #include "x86/sse-intrapred.h" |
22 | | #include "libde265/util.h" |
23 | | |
24 | | #ifdef HAVE_CONFIG_H |
25 | | #include "config.h" |
26 | | #endif |
27 | | |
28 | | #include <string.h> |
29 | | #include <emmintrin.h> // SSE2 |
30 | | |
31 | | #if HAVE_SSE4_1 |
32 | | #include <smmintrin.h> // SSE4.1 |
33 | | |
// angle / inverse-angle lookup tables, defined in intrapred.cc
// In this file intraPredAngle_table is indexed directly by the intra prediction mode, and
// invAngle_table is indexed by (mode - 11); the latter is only read when the looked-up
// angle is negative.
extern const int intraPredAngle_table[1+34];
extern const int invAngle_table[25-10];
37 | | |
38 | | namespace { |
39 | | |
// Block-size bound used to dimension the angular predictor's scratch reference array:
// that array holds 4*kMaxBlk+1 bytes and its logical index 0 sits at offset 2*kMaxBlk,
// so both negative and positive reference indices stay inside the buffer.
const int kMaxBlk = 64;   // == MAX_INTRA_PRED_BLOCK_SIZE
41 | | |
42 | | // load 4 / 8 consecutive uint8_t and zero-extend to 8x int16 |
43 | 0 | inline __m128i load4_epi16(const uint8_t* p) { |
44 | 0 | int32_t t; memcpy(&t, p, 4); |
45 | 0 | return _mm_cvtepu8_epi16(_mm_cvtsi32_si128(t)); |
46 | 0 | } |
47 | 0 | inline __m128i load8_epi16(const uint8_t* p) { |
48 | 0 | return _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*)p)); |
49 | 0 | } |
50 | | |
// Writes one row of nT bytes taken from the low lanes of the byte-packed register v.
//   nT == 4  : low 4 bytes of v  -> d[0..3]
//   nT == 8  : low 8 bytes of v  -> d[0..7]
//   nT == 16 : all 16 bytes of v -> d[0..15]
//   otherwise (intended for 32): the same 16 bytes are written to d[0..15] and again to
//   d[16..31], so the second half repeats the first -- only meaningful when v is a splat.
// d needs no particular alignment.
inline void store_row(uint8_t* d, __m128i v, int nT) {
  if (nT == 4) {
    const int32_t low = _mm_cvtsi128_si32(v);
    memcpy(d, &low, sizeof(low));
    return;
  }
  if (nT == 8) {
    _mm_storel_epi64(reinterpret_cast<__m128i*>(d), v);
    return;
  }

  // every remaining width starts with a full 16-byte store
  _mm_storeu_si128(reinterpret_cast<__m128i*>(d), v);
  if (nT != 16) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(d + 16), v);
  }
}
61 | | |
// Copies one row from s to d. Widths 4, 8 and 16 copy exactly that many bytes; any other
// width is handled as 32 (two unaligned 16-byte load/store pairs, low half first).
inline void copy_row(uint8_t* d, const uint8_t* s, int nT) {
  if (nT == 4) {
    memcpy(d, s, 4);
    return;
  }
  if (nT == 8) {
    memcpy(d, s, 8);
    return;
  }

  const __m128i lowHalf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(d), lowHalf);
  if (nT == 16) {
    return;
  }

  const __m128i highHalf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 16));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(d + 16), highHalf);
}
72 | | |
// Returns Log2(nT)+1 for nT in {4, 8, 16, 32}; every other value also yields 6.
inline int shift_for(int nT) {
  switch (nT) {
    case 4:  return 3;
    case 8:  return 4;
    case 16: return 5;
    default: return 6;
  }
}
74 | | |
75 | | } // namespace |
76 | | |
77 | | |
78 | | void intra_pred_dc_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border) |
79 | 0 | { |
80 | 0 | const int shift = shift_for(nT); |
81 | |
|
82 | 0 | int dcVal = 0; |
83 | 0 | for (int i=0;i<nT;i++) { dcVal += border[i+1]; dcVal += border[-i-1]; } |
84 | 0 | dcVal += nT; |
85 | 0 | dcVal >>= shift; |
86 | |
|
87 | 0 | const __m128i v = _mm_set1_epi8((char)dcVal); |
88 | 0 | for (int y=0;y<nT;y++) store_row(dst + y*stride, v, nT); |
89 | | |
90 | | // luma edge smoothing overwrites first row and first column (disjoint cells) |
91 | 0 | if (cIdx==0 && nT<32) { |
92 | 0 | dst[0] = (uint8_t)((border[-1] + 2*dcVal + border[1] + 2) >> 2); |
93 | 0 | for (int x=1;x<nT;x++) dst[x] = (uint8_t)((border[ x+1] + 3*dcVal + 2) >> 2); |
94 | 0 | for (int y=1;y<nT;y++) dst[y*stride] = (uint8_t)((border[-y-1] + 3*dcVal + 2) >> 2); |
95 | 0 | } |
96 | 0 | } |
97 | | |
98 | | |
99 | | void intra_pred_planar_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border) |
100 | 0 | { |
101 | 0 | const int shift = shift_for(nT); |
102 | 0 | const int TR = border[ 1+nT]; // top-right corner sample |
103 | 0 | const int BL = border[-1-nT]; // bottom-left corner sample |
104 | |
|
105 | 0 | const __m128i base = _mm_setr_epi16(0,1,2,3,4,5,6,7); |
106 | 0 | const __m128i vTR = _mm_set1_epi16((short)TR); |
107 | 0 | const __m128i vNTm1 = _mm_set1_epi16((short)(nT-1)); |
108 | 0 | const __m128i one = _mm_set1_epi16(1); |
109 | |
|
110 | 0 | for (int y=0;y<nT;y++) { |
111 | 0 | const int left_y = border[-1-y]; |
112 | 0 | const int Cy = (y+1)*BL + nT; // constant term for this row |
113 | 0 | const __m128i vL = _mm_set1_epi16((short)left_y); |
114 | 0 | const __m128i vNT1Y = _mm_set1_epi16((short)(nT-1-y)); |
115 | 0 | const __m128i vC = _mm_set1_epi16((short)Cy); |
116 | |
|
117 | 0 | for (int x=0;x<nT;x+=8) { |
118 | 0 | const __m128i xidx = _mm_add_epi16(_mm_set1_epi16((short)x), base); |
119 | 0 | const __m128i vA = _mm_sub_epi16(vNTm1, xidx); // (nT-1-x) |
120 | 0 | const __m128i vB = _mm_add_epi16(xidx, one); // (x+1) |
121 | 0 | const __m128i top = (nT==4) ? load4_epi16(border+1+x) : load8_epi16(border+1+x); |
122 | |
|
123 | 0 | __m128i acc = _mm_mullo_epi16(vA, vL); // (nT-1-x)*border[-1-y] |
124 | 0 | acc = _mm_add_epi16(acc, _mm_mullo_epi16(vB, vTR)); // (x+1)*border[1+nT] |
125 | 0 | acc = _mm_add_epi16(acc, _mm_mullo_epi16(top, vNT1Y));// (nT-1-y)*border[1+x] |
126 | 0 | acc = _mm_add_epi16(acc, vC); // (y+1)*border[-1-nT] + nT |
127 | 0 | acc = _mm_srli_epi16(acc, shift); |
128 | |
|
129 | 0 | const __m128i p = _mm_packus_epi16(acc, acc); |
130 | 0 | store_row(dst + y*stride + x, p, (nT<8)?nT:8); |
131 | 0 | } |
132 | 0 | } |
133 | 0 | } |
134 | | |
135 | | |
136 | | void intra_pred_angular_8_sse4(uint8_t* dst, ptrdiff_t stride, int bit_depth, int disableBoundaryFilter, |
137 | | int xB0, int yB0, int mode, int nT, int cIdx, const uint8_t* border) |
138 | 0 | { |
139 | 0 | const int intraPredAngle = intraPredAngle_table[mode]; |
140 | |
|
141 | 0 | uint8_t ref_mem[4*kMaxBlk+1]; |
142 | 0 | uint8_t* ref = &ref_mem[2*kMaxBlk]; |
143 | |
|
144 | 0 | if (mode >= 18) { |
145 | 0 | for (int x=0;x<=nT;x++) ref[x] = border[x]; |
146 | |
|
147 | 0 | if (intraPredAngle<0) { |
148 | 0 | const int invAngle = invAngle_table[mode-11]; |
149 | 0 | if (((nT*intraPredAngle)>>5) < -1) { |
150 | 0 | for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) |
151 | 0 | ref[x] = border[0-((x*invAngle+128)>>8)]; |
152 | 0 | } |
153 | 0 | } else { |
154 | 0 | for (int x=nT+1; x<=2*nT; x++) ref[x] = border[x]; |
155 | 0 | } |
156 | |
|
157 | 0 | for (int y=0;y<nT;y++) { |
158 | 0 | const int iIdx = ((y+1)*intraPredAngle)>>5; |
159 | 0 | const int iFact = ((y+1)*intraPredAngle)&31; |
160 | 0 | const uint8_t* src = ref + iIdx + 1; |
161 | 0 | uint8_t* d = dst + y*stride; |
162 | |
|
163 | 0 | if (iFact==0) { |
164 | 0 | copy_row(d, src, nT); // dst[x] = ref[x+iIdx+1] |
165 | 0 | } else { |
166 | 0 | const __m128i w0 = _mm_set1_epi16((short)(32-iFact)); |
167 | 0 | const __m128i w1 = _mm_set1_epi16((short)iFact); |
168 | 0 | const __m128i r16 = _mm_set1_epi16(16); |
169 | 0 | if (nT==4) { |
170 | 0 | __m128i a = load4_epi16(src), b = load4_epi16(src+1); |
171 | 0 | __m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16); |
172 | 0 | acc = _mm_srli_epi16(acc, 5); |
173 | 0 | store_row(d, _mm_packus_epi16(acc,acc), 4); |
174 | 0 | } else { |
175 | 0 | for (int x=0;x<nT;x+=8) { |
176 | 0 | __m128i a = load8_epi16(src+x), b = load8_epi16(src+x+1); |
177 | 0 | __m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16); |
178 | 0 | acc = _mm_srli_epi16(acc, 5); |
179 | 0 | store_row(d+x, _mm_packus_epi16(acc,acc), 8); |
180 | 0 | } |
181 | 0 | } |
182 | 0 | } |
183 | 0 | } |
184 | |
|
185 | 0 | if (mode==26 && cIdx==0 && nT<32 && !disableBoundaryFilter) { |
186 | 0 | for (int y=0;y<nT;y++) |
187 | 0 | dst[y*stride] = (uint8_t)Clip_BitDepth(border[1] + ((border[-1-y] - border[0])>>1), bit_depth); |
188 | 0 | } |
189 | 0 | } |
190 | 0 | else { |
191 | | // Modes 2..17: the reference projection is transposed (per-column iIdx/iFact and |
192 | | // a row-indexed reference fetch), which does not map onto contiguous SIMD loads or |
193 | | // stores. Use the scalar reference path here -- kept bit-identical to |
194 | | // intra_prediction_angular() in intrapred.h. |
195 | 0 | for (int x=0;x<=nT;x++) ref[x] = border[-x]; |
196 | |
|
197 | 0 | if (intraPredAngle<0) { |
198 | 0 | const int invAngle = invAngle_table[mode-11]; |
199 | 0 | if (((nT*intraPredAngle)>>5) < -1) { |
200 | 0 | for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) |
201 | 0 | ref[x] = border[(x*invAngle+128)>>8]; |
202 | 0 | } |
203 | 0 | } else { |
204 | 0 | for (int x=nT+1; x<=2*nT; x++) ref[x] = border[-x]; |
205 | 0 | } |
206 | |
|
207 | 0 | for (int y=0;y<nT;y++) |
208 | 0 | for (int x=0;x<nT;x++) { |
209 | 0 | const int iIdx = ((x+1)*intraPredAngle)>>5; |
210 | 0 | const int iFact = ((x+1)*intraPredAngle)&31; |
211 | 0 | if (iFact != 0) |
212 | 0 | dst[x+y*stride] = (uint8_t)(((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5); |
213 | 0 | else |
214 | 0 | dst[x+y*stride] = ref[y+iIdx+1]; |
215 | 0 | } |
216 | |
|
217 | 0 | if (mode==10 && cIdx==0 && nT<32 && !disableBoundaryFilter) { |
218 | 0 | for (int x=0;x<nT;x++) |
219 | 0 | dst[x] = (uint8_t)Clip_BitDepth(border[-1] + ((border[1+x] - border[0])>>1), bit_depth); |
220 | 0 | } |
221 | 0 | } |
222 | 0 | } |
223 | | |
224 | | #endif // HAVE_SSE4_1 |