/src/vvdec/source/Lib/CommonLib/x86/IntraPredX86.h
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | |
43 | | /** \file IntraPredX86.h |
44 | | \brief SIMD for IntraPrediction |
45 | | */ |
46 | | |
47 | | #include "CommonLib/CommonDef.h" |
48 | | #include "CommonDefX86.h" |
49 | | #include "CommonLib/IntraPrediction.h" |
50 | | |
51 | | namespace vvdec |
52 | | { |
53 | | |
54 | | #if ENABLE_SIMD_OPT_INTRAPRED |
55 | | #ifdef TARGET_SIMD_X86 |
56 | | |
57 | | //#define USE_AVX2 |
58 | | template< X86_VEXT vext, int W > |
59 | | void IntraPredAngleChroma_SIMD(int16_t* pDst,const ptrdiff_t dstStride,int16_t* pBorder,int width,int height,int deltaPos,int intraPredAngle) |
60 | 0 | { |
61 | 0 | int deltaInt; |
62 | 0 | int deltaFract; |
63 | 0 | int refMainIndex; |
64 | |
|
65 | 0 | __m128i voffset = _mm_set1_epi16(16); |
66 | 0 | if( W == 8 ) |
67 | 0 | { |
68 | 0 | if( vext >= AVX2 ) |
69 | 0 | { |
70 | | #ifdef USE_AVX2 |
71 | 0 | if (( width & 15 ) == 0 ) |
72 | 0 | { |
73 | 0 | int deltaInt; |
74 | 0 | int deltaFract; |
75 | 0 | int refMainIndex; |
76 | | |
77 | | __m256i voffset = _mm256_set1_epi16(16); |
78 | 0 | for (int k=0; k<height; k++) { |
79 | |
|
80 | 0 | deltaInt = deltaPos >> 5; |
81 | 0 | deltaFract = deltaPos & (32 - 1); |
82 | |
|
83 | 0 | __m256i vfract = _mm256_set1_epi16(deltaFract); |
84 | 0 | __m256i v32minfract = _mm256_set1_epi16(32-deltaFract); |
85 | | // Do linear filtering |
86 | 0 | for (int l=0; l<width; l+=16) { |
87 | 0 | refMainIndex = l+ deltaInt+1; |
88 | 0 | __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]); |
89 | 0 | __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]); |
90 | 0 | vpred0 = _mm256_mullo_epi16(v32minfract, vpred0); |
91 | 0 | vpred1 = _mm256_mullo_epi16(vfract, vpred1); |
92 | 0 | __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5); |
93 | 0 | _mm256_storeu_si256((__m256i*)&pDst[l], vpred); |
94 | 0 | } |
95 | 0 | pDst+=dstStride; |
96 | 0 | deltaPos += intraPredAngle; |
97 | 0 | } |
98 | 0 | } |
99 | 0 | else // width==8 |
100 | 0 | { |
101 | 0 | for (int k=0; k<height; k++) |
102 | 0 | { |
103 | 0 | deltaInt = deltaPos >> 5; |
104 | 0 | deltaFract = deltaPos & (32 - 1); |
105 | |
|
106 | 0 | __m128i vfract = _mm_set1_epi16(deltaFract); |
107 | 0 | __m128i v32minfract = _mm_set1_epi16(32-deltaFract); |
108 | | // Do linear filtering |
109 | 0 | for (int l=0; l<width; l+=8) { |
110 | 0 | refMainIndex = l+ deltaInt+1; |
111 | 0 | __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]); |
112 | 0 | __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]); |
113 | 0 | vpred0 = _mm_mullo_epi16(v32minfract, vpred0); |
114 | 0 | vpred1 = _mm_mullo_epi16(vfract, vpred1); |
115 | 0 | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5); |
116 | 0 | _mm_storeu_si128((__m128i*)&pDst[l], vpred); |
117 | 0 | } |
118 | 0 | deltaPos += intraPredAngle; |
119 | |
|
120 | 0 | pDst+=dstStride; |
121 | 0 | } |
122 | |
|
123 | 0 | } |
124 | | #endif //AVX2 |
125 | 0 | } |
126 | 0 | else |
127 | 0 | { |
128 | 0 | for (int k=0; k<height; k++) { |
129 | 0 | deltaInt = deltaPos >> 5; |
130 | 0 | deltaFract = deltaPos & (32 - 1); |
131 | |
|
132 | 0 | __m128i vfract = _mm_set1_epi16(deltaFract); |
133 | 0 | __m128i v32minfract = _mm_set1_epi16(32-deltaFract); |
134 | | // Do linear filtering |
135 | 0 | for (int l=0; l<width; l+=8) { |
136 | 0 | refMainIndex = l+ deltaInt+1; |
137 | 0 | __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]); |
138 | 0 | __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]); |
139 | 0 | vpred0 = _mm_mullo_epi16(v32minfract, vpred0); |
140 | 0 | vpred1 = _mm_mullo_epi16(vfract, vpred1); |
141 | 0 | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5); |
142 | 0 | _mm_storeu_si128((__m128i*)&pDst[l], vpred); |
143 | 0 | } |
144 | 0 | deltaPos += intraPredAngle; |
145 | |
|
146 | 0 | pDst+=dstStride; |
147 | 0 | } |
148 | 0 | } |
149 | |
|
150 | 0 | } |
151 | 0 | else if( W == 4 ) |
152 | 0 | { |
153 | 0 | for (int k=0; k<height; k++) { |
154 | 0 | deltaInt = deltaPos >> 5; |
155 | 0 | deltaFract = deltaPos & (32 - 1); |
156 | |
|
157 | 0 | __m128i vfract = _mm_set1_epi16(deltaFract); |
158 | 0 | __m128i v32minfract = _mm_set1_epi16(32-deltaFract); |
159 | | // Do linear filtering |
160 | 0 | refMainIndex = deltaInt+1; |
161 | 0 | __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]); |
162 | 0 | __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]); |
163 | 0 | vpred0 = _mm_mullo_epi16(v32minfract, vpred0); |
164 | 0 | vpred1 = _mm_mullo_epi16(vfract, vpred1); |
165 | 0 | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5); |
166 | 0 | _mm_storeu_si64( ( __m128i * )(pDst ), vpred); |
167 | 0 | deltaPos += intraPredAngle; |
168 | 0 | pDst+=dstStride; |
169 | 0 | } |
170 | 0 | } |
171 | 0 | else |
172 | 0 | { |
173 | 0 | THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" ); |
174 | 0 | } |
175 | | #if USE_AVX2 |
176 | | |
177 | 0 | _mm256_zeroupper(); |
178 | 0 | #endif |
179 | 0 | } Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)1, 4>(short*, long, short*, int, int, int, int) Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, short*, int, int, int, int) Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)4, 4>(short*, long, short*, int, int, int, int) Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, short*, int, int, int, int) |
180 | | |
181 | | |
template< X86_VEXT vext, int W >
void IntraPredAngleCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t* refMain,int width,int height,int deltaPos,int intraPredAngle,const TFilterCoeff *ff,const bool useCubicFilter,const ClpRng& clpRng)
{
  // Angular intra prediction (luma) using a 4-tap interpolation filter.
  // Per row, deltaPos (1/32-pel units) is split into an integer offset
  // (deltaPos >> 5) and a fraction (deltaPos & 31); the 4 filter taps for
  // that fraction are read from ff[deltaFract<<2 .. +3] and applied to
  // refMain[refMainIndex-1 .. +2], with rounding offset 32 and shift 6.
  // When useCubicFilter is set, results are additionally clipped to
  // [clpRng.min(), clpRng.max()] — presumably because the cubic taps can
  // overshoot the sample range (TODO confirm; no clipping on the other path).
  //
  // \param pDstBuf        destination buffer, indexed as pDstBuf[y*dstStride+x]
  // \param dstStride      destination stride in samples
  // \param refMain        main reference sample array
  // \param width          block width (W==8 paths: multiple of 8; W==4: 4)
  // \param height         block height
  // \param deltaPos       initial projected position, 1/32-pel units
  // \param intraPredAngle per-row increment of deltaPos, 1/32-pel units
  // \param ff             filter coefficient table, 4 x int16 per fraction
  // \param useCubicFilter enables clipping to the sample range
  // \param clpRng         clipping range used when useCubicFilter is true
  int16_t* pDst;

  if( W == 8 )
  {
    if( vext >= AVX2 )
    {
#ifdef USE_AVX2
      // shflmask1 expands 16-bit samples so that each 64-bit group holds the
      // 4 consecutive taps' inputs for one output sample (see inline notes).
      __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
                                         0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
      __m256i offset = _mm256_set1_epi32( 32 );  // rounding term before >> 6

      if (( width & 15 ) == 0 )
      {
        // AVX2 path: 16 output samples per inner iteration.
        __m256i vbdmin,vbdmax;

        if (useCubicFilter)
        {
          vbdmin = _mm256_set1_epi16( clpRng.min() );
          vbdmax = _mm256_set1_epi16( clpRng.max() );
        }

        for (int y = 0; y<height; y++ )
        {
          int deltaInt   = deltaPos >> 5;
          int deltaFract = deltaPos & (32 - 1);
          int refMainIndex   = deltaInt + 1;
          pDst=&pDstBuf[y*dstStride];
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
          tmp = _mm_shuffle_epi32(tmp,0x44);                // duplicate taps into both 64-bit halves
          __m256i coeff = _mm256_broadcastsi128_si256(tmp); // ... and into both lanes
          for( int x = 0; x < width; x+=16)
          {
            // First 8 outputs of this 16-wide group.
            __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex - 1] ) );
            __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
            src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
            src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9

            src1 = _mm256_madd_epi16 (src1, coeff);     // pairwise multiply-add -> partial sums
            src2 = _mm256_madd_epi16 (src2, coeff);

            __m256i sum  = _mm256_hadd_epi32( src1, src2 );   // finish the 4-tap sums
            sum = _mm256_permute4x64_epi64(sum,0xD8);         // fix hadd's lane interleaving

            sum = _mm256_add_epi32( sum, offset );
            sum = _mm256_srai_epi32( sum, 6 );

            refMainIndex+=8;

            // Second 8 outputs of this 16-wide group.
            src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex - 1] ) );
            src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );

            src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
            src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9
            src1 = _mm256_madd_epi16 (src1, coeff);
            src2 = _mm256_madd_epi16 (src2, coeff);

            __m256i sum1  = _mm256_hadd_epi32( src1, src2 );
            sum1 = _mm256_permute4x64_epi64(sum1,0xD8);

            sum1 = _mm256_add_epi32( sum1, offset );
            sum1 = _mm256_srai_epi32( sum1, 6 );
            __m256i
            src0 = _mm256_packs_epi32( sum, sum1 );     // pack both halves to 16 bit (saturating)

            src0 = _mm256_permute4x64_epi64(src0,0xD8); // restore sample order after pack

            refMainIndex+=8;

            if (useCubicFilter)
              src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );

            _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
          }
          deltaPos += intraPredAngle;
        }
      }
      else // width =8
      {
        // printf("AVX2 Block %d \n",width);
        // AVX2 build, exactly 8 outputs per row.
        __m128i vbdmin,vbdmax;

        if (useCubicFilter)
        {
          vbdmin = _mm_set1_epi16( clpRng.min() );
          vbdmax = _mm_set1_epi16( clpRng.max() );
        }

        for (int y = 0; y<height; y++ )
        {
          int deltaInt   = deltaPos >> 5;
          int deltaFract = deltaPos & (32 - 1);
          int refMainIndex   = deltaInt + 1;
          pDst=&pDstBuf[y*dstStride];
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
          tmp = _mm_shuffle_epi32(tmp,0x44);
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
          __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) );
          __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) );
          src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
          src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9

          src1 = _mm256_madd_epi16 (src1, coeff);
          src2 = _mm256_madd_epi16 (src2, coeff);

          __m256i sum  = _mm256_hadd_epi32( src1, src2 );
          sum = _mm256_permute4x64_epi64(sum,0xD8);

          sum = _mm256_add_epi32( sum, offset );
          sum = _mm256_srai_epi32( sum, 6 );
          __m128i dest128 = _mm256_cvtepi32_epi16x( sum );  // narrow 8x32 -> 8x16

          if (useCubicFilter)
            dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );

          _mm_storeu_si128( ( __m128i * )(pDst), dest128);
          deltaPos += intraPredAngle;
        }
      }
#endif
    }
    else
    {
      // SSE path: two shuffle masks build the 4-tap inputs for outputs 0..1
      // and 2..3 of each 4-sample group from a single 8-sample load.
      __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
      __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
      __m128i vbdmin,vbdmax;

      __m128i offset = _mm_set1_epi32( 32 );  // rounding term before >> 6

      if (useCubicFilter)
      {
        vbdmin = _mm_set1_epi16( clpRng.min() );
        vbdmax = _mm_set1_epi16( clpRng.max() );
      }

      for (int y = 0; y<height; y++ )
      {
        int deltaInt   = deltaPos >> 5;
        int deltaFract = deltaPos & (32 - 1);
        int refMainIndex   = deltaInt + 1;
        pDst=&pDstBuf[y*dstStride];
        __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
        coeff = _mm_shuffle_epi32(coeff,0x44);  // duplicate taps into both halves
        for( int x = 0; x < width; x+=8)
        {
          // Outputs x..x+3.
          __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels  -1 0 1 2 3 4 5 6
          __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
          __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); //  1 2 3 4 2 3 4 5
          src0 = _mm_madd_epi16( coeff,src1 );
          src1 = _mm_madd_epi16( coeff,src2 );
          __m128i sum  = _mm_hadd_epi32( src0, src1 );  // finish 4-tap sums
          sum = _mm_add_epi32( sum, offset );
          sum = _mm_srai_epi32( sum, 6 );

          // Outputs x+4..x+7.
          refMainIndex+=4;
          src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels  -1 0 1 2 3 4 5 6
          src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
          src2 = _mm_shuffle_epi8(src0,shflmask2);

          //  1 2 3 4 2 3 4 5
          src0 = _mm_madd_epi16( coeff,src1 );
          src1 = _mm_madd_epi16( coeff,src2 );

          __m128i sum1  = _mm_hadd_epi32( src0, src1 );
          sum1 = _mm_add_epi32( sum1, offset );
          sum1 = _mm_srai_epi32( sum1, 6 );
          src0 = _mm_packs_epi32( sum, sum1 );  // pack to 16 bit (saturating)

          refMainIndex+=4;

          if (useCubicFilter)
            src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );

          _mm_storeu_si128( ( __m128i * )(pDst + x), src0);

        }
        deltaPos += intraPredAngle;
      }
    }
  }
  else if( W == 4 )
  {
    // 4-sample rows: same SSE scheme, single group per row; only the low
    // 4 results of the pack are stored.

    __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
    __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
    __m128i vbdmin,vbdmax;

    __m128i offset = _mm_set1_epi32( 32 );

    if (useCubicFilter)
    {
      vbdmin = _mm_set1_epi16( clpRng.min() );
      vbdmax = _mm_set1_epi16( clpRng.max() );
    }

    for (int y = 0; y<height; y++ )
    {
      int deltaInt   = deltaPos >> 5;
      int deltaFract = deltaPos & (32 - 1);
      int refMainIndex   = deltaInt + 1;
      pDst=&pDstBuf[y*dstStride];
      __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
      coeff = _mm_shuffle_epi32(coeff,0x44);
      {
        __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels  -1 0 1 2 3 4 5 6
        __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
        __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); //  1 2 3 4 2 3 4 5
        src0 = _mm_madd_epi16( coeff,src1 );
        src1 = _mm_madd_epi16( coeff,src2 );
        __m128i sum  = _mm_hadd_epi32( src0, src1 );
        sum = _mm_add_epi32( sum, offset );
        sum = _mm_srai_epi32( sum, 6 );

        src0 = _mm_packs_epi32( sum, sum );

        refMainIndex+=4;

        if (useCubicFilter)
          src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );

        _mm_storeu_si64( ( __m128i * )(pDst ), src0);

      }
      deltaPos += intraPredAngle;
    }
  }
  else
  {
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
  }
#if USE_AVX2
  _mm256_zeroupper();
#endif
}
418 | | |
419 | | template< X86_VEXT vext, int W > |
420 | | void IntraPredSampleFilter_SIMD(Pel *ptrSrc,const ptrdiff_t srcStride,PelBuf &piPred,const uint32_t uiDirMode,const ClpRng& clpRng) |
421 | 0 | { |
422 | 0 | const int iWidth = piPred.width; |
423 | 0 | const int iHeight = piPred.height; |
424 | 0 | PelBuf dstBuf = piPred; |
425 | 0 | Pel* pDst = dstBuf.buf; |
426 | 0 | const ptrdiff_t dstStride = dstBuf.stride; |
427 | |
|
428 | 0 | const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2); |
429 | 0 | CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2"); |
430 | |
|
431 | | #if USE_AVX2 |
432 | 0 | if( W > 8 ) |
433 | 0 | { |
434 | 0 | __m256i tmplo,tmphi; |
435 | 0 | __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64); |
436 | 0 | __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32); |
437 | 0 | __m256i vbdmin = _mm256_set1_epi32( clpRng.min() ); |
438 | 0 | __m256i vbdmax = _mm256_set1_epi32( clpRng.max() ); |
439 | 0 | __m256i wl16; |
440 | 0 | __m256i wl16start; |
441 | | |
442 | 0 | if (scale==0) |
443 | 0 | { |
444 | 0 | wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32); |
445 | 0 | } |
446 | 0 | else if (scale==1) |
447 | 0 | { |
448 | 0 | wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32); |
449 | 0 | } |
450 | 0 | else if (scale==2) |
451 | 0 | { |
452 | 0 | wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32); |
453 | 0 | } |
454 | 0 | else |
455 | 0 | { |
456 | 0 | THROW_FATAL( "Wrong scale (" << scale << ")" ); |
457 | 0 | } |
458 | | |
459 | | |
460 | 0 | if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX ) |
461 | 0 | { |
462 | 0 | for (int y = 0; y < iHeight; y++) |
463 | 0 | { |
464 | 0 | int wT = 32 >> std::min(31, ((y << 1) >> scale)); |
465 | |
|
466 | 0 | __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT); |
467 | 0 | __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)))); |
468 | |
|
469 | 0 | if (wT) |
470 | 0 | { |
471 | 0 | for (int x = 0; x < iWidth; x+=16) |
472 | 0 | { |
473 | 0 | if (x==0) |
474 | 0 | { |
475 | 0 | wl16=wl16start; |
476 | |
|
477 | 0 | __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top |
478 | 0 | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst |
479 | |
|
480 | 0 | tmplo = _mm256_mullo_epi16(x16left,wl16); //wL * left |
481 | 0 | tmphi = _mm256_mulhi_epi16(x16left,wl16); //wL * left |
482 | 0 | __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi); |
483 | 0 | __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi); |
484 | |
|
485 | 0 | tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*top |
486 | 0 | tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*top |
487 | 0 | __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi); |
488 | 0 | __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi); |
489 | |
|
490 | 0 | __m256i wX = _mm256_sub_epi16(w64,wl16); |
491 | 0 | wX = _mm256_sub_epi16(wX,wt16); // 64-wL-wT |
492 | 0 | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst |
493 | 0 | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst |
494 | 0 | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); |
495 | 0 | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); |
496 | |
|
497 | 0 | dstlo = _mm256_add_epi32(dstlo,toplo); |
498 | 0 | dsthi = _mm256_add_epi32(dsthi,tophi); |
499 | 0 | dstlo = _mm256_add_epi32(dstlo,leftlo); |
500 | 0 | dsthi = _mm256_add_epi32(dsthi,lefthi); |
501 | 0 | dstlo = _mm256_add_epi32(dstlo,w32); |
502 | 0 | dsthi = _mm256_add_epi32(dsthi,w32); |
503 | |
|
504 | 0 | dstlo = _mm256_srai_epi32(dstlo,6); |
505 | 0 | dsthi = _mm256_srai_epi32(dsthi,6); |
506 | |
|
507 | 0 | dstlo = _mm256_max_epi32(vbdmin,dstlo); |
508 | 0 | dsthi = _mm256_max_epi32(vbdmin,dsthi); |
509 | 0 | dstlo = _mm256_min_epi32(vbdmax,dstlo); |
510 | 0 | dsthi = _mm256_min_epi32(vbdmax,dsthi); |
511 | |
|
512 | 0 | dstlo = _mm256_packs_epi32(dstlo,dsthi); |
513 | 0 | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); |
514 | |
|
515 | 0 | _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo ); |
516 | 0 | } |
517 | 0 | else |
518 | 0 | { |
519 | |
|
520 | 0 | __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top |
521 | 0 | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst |
522 | | |
523 | |
|
524 | 0 | tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*top |
525 | 0 | tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*top |
526 | 0 | __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi); |
527 | 0 | __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi); |
528 | |
|
529 | 0 | __m256i wX = _mm256_sub_epi16(w64,wt16); |
530 | 0 | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst |
531 | 0 | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst |
532 | 0 | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); |
533 | 0 | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); |
534 | |
|
535 | 0 | dstlo = _mm256_add_epi32(dstlo,toplo); |
536 | 0 | dsthi = _mm256_add_epi32(dsthi,tophi); |
537 | 0 | dstlo = _mm256_add_epi32(dstlo,w32); |
538 | 0 | dsthi = _mm256_add_epi32(dsthi,w32); |
539 | |
|
540 | 0 | dstlo = _mm256_srai_epi32(dstlo,6); |
541 | 0 | dsthi = _mm256_srai_epi32(dsthi,6); |
542 | |
|
543 | 0 | dstlo = _mm256_max_epi32(vbdmin,dstlo); |
544 | 0 | dsthi = _mm256_max_epi32(vbdmin,dsthi); |
545 | 0 | dstlo = _mm256_min_epi32(vbdmax,dstlo); |
546 | 0 | dsthi = _mm256_min_epi32(vbdmax,dsthi); |
547 | |
|
548 | 0 | dstlo = _mm256_packs_epi32(dstlo,dsthi); |
549 | 0 | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); |
550 | |
|
551 | 0 | _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo ); |
552 | 0 | } |
553 | |
|
554 | 0 | } // for |
555 | 0 | } |
556 | 0 | else |
557 | 0 | { // wT =0 |
558 | |
|
559 | 0 | wl16=wl16start; |
560 | 0 | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst |
561 | |
|
562 | 0 | tmplo = _mm256_mullo_epi16(x16left,wl16); //wL * left |
563 | 0 | tmphi = _mm256_mulhi_epi16(x16left,wl16); //wL * left |
564 | 0 | __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi); |
565 | 0 | __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi); |
566 | |
|
567 | 0 | __m256i wX = _mm256_sub_epi16(w64,wl16); |
568 | 0 | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst |
569 | 0 | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst |
570 | 0 | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); |
571 | 0 | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); |
572 | |
|
573 | 0 | dstlo = _mm256_add_epi32(dstlo,leftlo); |
574 | 0 | dsthi = _mm256_add_epi32(dsthi,lefthi); |
575 | 0 | dstlo = _mm256_add_epi32(dstlo,w32); |
576 | 0 | dsthi = _mm256_add_epi32(dsthi,w32); |
577 | |
|
578 | 0 | dstlo = _mm256_srai_epi32(dstlo,6); |
579 | 0 | dsthi = _mm256_srai_epi32(dsthi,6); |
580 | |
|
581 | 0 | dstlo = _mm256_max_epi32(vbdmin,dstlo); |
582 | 0 | dsthi = _mm256_max_epi32(vbdmin,dsthi); |
583 | 0 | dstlo = _mm256_min_epi32(vbdmax,dstlo); |
584 | 0 | dsthi = _mm256_min_epi32(vbdmax,dsthi); |
585 | |
|
586 | 0 | dstlo = _mm256_packs_epi32(dstlo,dsthi); |
587 | 0 | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); |
588 | |
|
589 | 0 | _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo ); |
590 | 0 | } |
591 | 0 | } |
592 | 0 | } |
593 | 0 | } |
594 | 0 | else |
595 | 0 | #endif |
596 | 0 | { |
597 | 0 | __m128i tmplo8,tmphi8; |
598 | 0 | __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64); |
599 | 0 | __m128i w32_8 = _mm_set_epi32(32,32,32,32); |
600 | 0 | __m128i vbdmin8 = _mm_set1_epi32( clpRng.min() ); |
601 | 0 | __m128i vbdmax8 = _mm_set1_epi32( clpRng.max() ); |
602 | 0 | __m128i wl8start,wl8start2; |
603 | 0 | CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2"); |
604 | |
|
605 | 0 | if (scale==0) |
606 | 0 | { |
607 | 0 | wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32); |
608 | 0 | wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0); |
609 | 0 | } |
610 | 0 | else if (scale==1) |
611 | 0 | { |
612 | 0 | wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32); |
613 | 0 | wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0); |
614 | 0 | } |
615 | 0 | else if (scale==2) |
616 | 0 | { |
617 | 0 | wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32); |
618 | 0 | wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2); |
619 | 0 | } |
620 | 0 | if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX ) |
621 | 0 | { |
622 | 0 | __m128i wl8 = wl8start; |
623 | 0 | for (int y = 0; y < iHeight; y++) |
624 | 0 | { |
625 | 0 | int wT = 32 >> std::min(31, ((y << 1) >> scale)); |
626 | |
|
627 | 0 | __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT); |
628 | | // __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)))); |
629 | |
|
630 | 0 | __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))); |
631 | 0 | x8left =_mm_shufflelo_epi16(x8left,0); |
632 | 0 | x8left =_mm_shuffle_epi32(x8left,0); |
633 | | |
634 | |
|
635 | 0 | if (wT) |
636 | 0 | { |
637 | 0 | for (int x = 0; x < iWidth; x+=8) |
638 | 0 | { |
639 | 0 | __m128i x8top = _mm_loadu_si128( (__m128i*) ( ptrSrc + x + 1 ) ); // load top |
640 | 0 | __m128i x8dst = _mm_setzero_si128(); |
641 | 0 | if( iWidth >= 8 ) |
642 | 0 | x8dst = _mm_loadu_si128( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst |
643 | 0 | else if( iWidth == 4 ) |
644 | 0 | x8dst = _mm_loadu_si64( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst |
645 | 0 | else if( iWidth == 2 ) |
646 | 0 | x8dst = _mm_loadu_si32( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst |
647 | 0 | else |
648 | 0 | { |
649 | 0 | CHECKD( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" ); |
650 | 0 | } |
651 | | |
652 | 0 | if (x>8) |
653 | 0 | { |
654 | 0 | tmplo8 = _mm_mullo_epi16(x8top,wt8); // wT*top |
655 | 0 | tmphi8 = _mm_mulhi_epi16(x8top,wt8); // wT*top |
656 | 0 | __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); |
657 | 0 | __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); |
658 | | |
659 | |
|
660 | 0 | __m128i wX = _mm_sub_epi16(w64_8,wt8); |
661 | 0 | tmplo8 = _mm_mullo_epi16(x8dst,wX); // 64-wL-wT*dst |
662 | 0 | tmphi8 = _mm_mulhi_epi16(x8dst,wX); // 64-wL-wT*dst |
663 | 0 | __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); |
664 | 0 | __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); |
665 | |
|
666 | 0 | dstlo8 = _mm_add_epi32(dstlo8,toplo8); |
667 | 0 | dsthi8 = _mm_add_epi32(dsthi8,tophi8); |
668 | 0 | dstlo8 = _mm_add_epi32(dstlo8,w32_8); |
669 | 0 | dsthi8 = _mm_add_epi32(dsthi8,w32_8); |
670 | |
|
671 | 0 | dstlo8 = _mm_srai_epi32(dstlo8,6); |
672 | 0 | dsthi8 = _mm_srai_epi32(dsthi8,6); |
673 | |
|
674 | 0 | dstlo8 = _mm_max_epi32(vbdmin8,dstlo8); |
675 | 0 | dsthi8 = _mm_max_epi32(vbdmin8,dsthi8); |
676 | 0 | dstlo8 = _mm_min_epi32(vbdmax8,dstlo8); |
677 | 0 | dsthi8 = _mm_min_epi32(vbdmax8,dsthi8); |
678 | |
|
679 | 0 | x8dst = _mm_packs_epi32(dstlo8,dsthi8); |
680 | 0 | } |
681 | 0 | else // x<=8 |
682 | 0 | { |
683 | 0 | if (x==0) |
684 | 0 | wl8=wl8start; |
685 | 0 | else if (x==8) |
686 | 0 | wl8=wl8start2; |
687 | |
|
688 | 0 | tmplo8 = _mm_mullo_epi16(x8left,wl8); //wL * left |
689 | 0 | tmphi8 = _mm_mulhi_epi16(x8left,wl8); //wL * left |
690 | 0 | __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); |
691 | 0 | __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); |
692 | |
|
693 | 0 | tmplo8 = _mm_mullo_epi16(x8top,wt8); // wT*top |
694 | 0 | tmphi8 = _mm_mulhi_epi16(x8top,wt8); // wT*top |
695 | 0 | __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); |
696 | 0 | __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); |
697 | |
|
698 | 0 | __m128i wX = _mm_sub_epi16(w64_8,wl8); |
699 | 0 | wX = _mm_sub_epi16(wX,wt8); // 64-wL-wT |
700 | 0 | tmplo8 = _mm_mullo_epi16(x8dst,wX); // 64-wL-wT*dst |
701 | 0 | tmphi8 = _mm_mulhi_epi16(x8dst,wX); // 64-wL-wT*dst |
702 | 0 | __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); |
703 | 0 | __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); |
704 | |
|
705 | 0 | dstlo8 = _mm_add_epi32(dstlo8,toplo8); |
706 | 0 | dsthi8 = _mm_add_epi32(dsthi8,tophi8); |
707 | 0 | dstlo8 = _mm_add_epi32(dstlo8,leftlo8); |
708 | 0 | dsthi8 = _mm_add_epi32(dsthi8,lefthi8); |
709 | 0 | dstlo8 = _mm_add_epi32(dstlo8,w32_8); |
710 | 0 | dsthi8 = _mm_add_epi32(dsthi8,w32_8); |
711 | |
|
712 | 0 | dstlo8 = _mm_srai_epi32(dstlo8,6); |
713 | 0 | dsthi8 = _mm_srai_epi32(dsthi8,6); |
714 | |
|
715 | 0 | dstlo8 = _mm_max_epi32(vbdmin8,dstlo8); |
716 | 0 | dsthi8 = _mm_max_epi32(vbdmin8,dsthi8); |
717 | 0 | dstlo8 = _mm_min_epi32(vbdmax8,dstlo8); |
718 | 0 | dsthi8 = _mm_min_epi32(vbdmax8,dsthi8); |
719 | |
|
720 | 0 | x8dst = _mm_packs_epi32(dstlo8,dsthi8); |
721 | 0 | } |
722 | |
|
723 | 0 | if( iWidth >= 8 ) |
724 | 0 | _mm_storeu_si128( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); |
725 | 0 | else if( iWidth == 4 ) |
726 | 0 | _mm_storeu_si64( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); |
727 | 0 | else if( iWidth == 2 ) |
728 | 0 | _mm_storeu_si32( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); |
729 | 0 | } |
730 | 0 | } |
731 | 0 | else //wT =0 |
732 | 0 | { |
733 | 0 | for( int x = 0; x < std::min( iWidth, 16 ); x += 8 ) |
734 | 0 | { |
735 | 0 | if( x == 0 ) |
736 | 0 | wl8 = wl8start; |
737 | 0 | else if( x == 8 ) |
738 | 0 | wl8 = wl8start2; |
739 | |
|
740 | 0 | __m128i x8dst; |
741 | 0 | if( iWidth >= 8 ) |
742 | 0 | x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst |
743 | 0 | else if( iWidth == 4 ) |
744 | 0 | x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst |
745 | 0 | else if( iWidth == 2 ) |
746 | 0 | x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst |
747 | 0 | else |
748 | 0 | CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" ); |
749 | | |
750 | |
|
751 | 0 | tmplo8 = _mm_mullo_epi16( x8left, wl8 ); // wL * left |
752 | 0 | tmphi8 = _mm_mulhi_epi16( x8left, wl8 ); // wL * left |
753 | 0 | __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 ); |
754 | 0 | __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 ); |
755 | |
|
756 | 0 | __m128i wX = _mm_sub_epi16( w64_8, wl8 ); |
757 | 0 | tmplo8 = _mm_mullo_epi16( x8dst, wX ); // 64-wL-wT*dst |
758 | 0 | tmphi8 = _mm_mulhi_epi16( x8dst, wX ); // 64-wL-wT*dst |
759 | 0 | __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 ); |
760 | 0 | __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 ); |
761 | |
|
762 | 0 | dstlo8 = _mm_add_epi32( dstlo8, leftlo8 ); |
763 | 0 | dsthi8 = _mm_add_epi32( dsthi8, lefthi8 ); |
764 | 0 | dstlo8 = _mm_add_epi32( dstlo8, w32_8 ); |
765 | 0 | dsthi8 = _mm_add_epi32( dsthi8, w32_8 ); |
766 | |
|
767 | 0 | dstlo8 = _mm_srai_epi32( dstlo8, 6 ); |
768 | 0 | dsthi8 = _mm_srai_epi32( dsthi8, 6 ); |
769 | |
|
770 | 0 | dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 ); |
771 | 0 | dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 ); |
772 | 0 | dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 ); |
773 | 0 | dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 ); |
774 | |
|
775 | 0 | dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 ); |
776 | |
|
777 | 0 | if( iWidth >= 8 ) |
778 | 0 | _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 ); |
779 | 0 | else if( iWidth == 4 ) |
780 | 0 | _mm_storeu_si64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) ); |
781 | 0 | else if( iWidth == 2 ) |
782 | 0 | _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 ); |
783 | 0 | } |
784 | 0 | } |
785 | 0 | } |
786 | 0 | } |
787 | 0 | } |
788 | | |
789 | | |
790 | | #if USE_AVX2 |
791 | 0 | _mm256_zeroupper(); |
792 | 0 | #endif |
793 | 0 | } Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&) Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)1, 16>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&) Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&) Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)4, 16>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&) |
794 | | |
/** Function for deriving planar intra prediction. This function derives the prediction samples for planar mode (intra coding).
 *
 *  Per-sample computation (mirrors the scalar reference, see inline comments):
 *    horPred  = (leftColumn << log2W) + (x+1) * (topRight  - leftColumn)
 *    vertPred = (topRow     << log2H) + (y+1) * (bottomLeft - topRow)
 *    pred[x]  = ( (horPred << log2H) + (vertPred << log2W) + offset ) >> finalShift
 *
 *  Eight samples are produced per inner iteration using 16x16->32 bit SSE
 *  arithmetic (mullo/mulhi + unpack); for width < 8 only the leading
 *  4 / 2 / 1 lanes of the result vector are stored.
 *
 *  @param pSrc reference-sample buffer; row 0 holds the top references,
 *              column 0 the left references (accessed via pSrc.at())
 *  @param pDst prediction output block (buf/stride/width/height are used)
 *  @param sps  unused in this SIMD path (kept for the common function signature)
 */
template< X86_VEXT vext>
void xPredIntraPlanar_SIMD( const CPelBuf &pSrc, PelBuf &pDst, const SPS& sps )
{
  const uint32_t width  = pDst.width;
  const uint32_t height = pDst.height;
  const uint32_t log2W  = getLog2( width );
  const uint32_t log2H  = getLog2( height );
  const uint32_t finalShift = 1 + log2W + log2H;   // total normalisation shift
  const uint32_t offset = 1 << (log2W + log2H);    // rounding offset applied before the final shift
  const ptrdiff_t stride = pDst.stride;
  Pel* pred = pDst.buf;

  const Pel *ptrSrc =pSrc.buf;

  int leftColumn,rightColumn;
  Pel tmp;
  // corner references that anchor the two linear interpolations
  int topRight = pSrc.at( width + 1, 0 );

  tmp=pSrc.at( 0, height+1 );
  const __m128i bottomLeft16 = _mm_set_epi16(tmp,tmp,tmp,tmp,tmp,tmp,tmp,tmp);
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set_epi16(8,8,8,8,8,8,8,8);
  const __m128i offset32 = _mm_set_epi32(offset,offset,offset,offset);
  // shift counts must live in a vector register for _mm_sll/_mm_srl
  const __m128i vLog2W = _mm_cvtsi32_si128(log2W);
  const __m128i vLog2H = _mm_cvtsi32_si128(log2H);
  const __m128i vFinalShift = _mm_cvtsi32_si128(finalShift);

  for( int y = 0; y < height; y++)
  {
    // per-row scalars: left reference sample and the horizontal gradient
    leftColumn=pSrc.at( 0, y + 1 );
    rightColumn = topRight - leftColumn;
    leftColumn = leftColumn << log2W;
    const __m128i leftColumn32 = _mm_set_epi32(leftColumn,leftColumn,leftColumn,leftColumn);
    const __m128i rightcolumn16 = _mm_set_epi16(rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn);
    const __m128i y16 = _mm_set_epi16(y+1,y+1,y+1,y+1,y+1,y+1,y+1,y+1);
    __m128i x16 = _mm_set_epi16(8,7,6,5,4,3,2,1);   // (x+1) for the first 8 columns

    for( int x = 0; x < width; x+=8 )
    {
      //topRow[x] = pSrc.at( x + 1, 0 );
      __m128i topRow16 = _mm_loadu_si128 ((__m128i const *) (ptrSrc+(x+1)));
      //bottomRow[x] = bottomLeft - topRow[x];
      __m128i bottomRow16L = _mm_sub_epi16(bottomLeft16,topRow16);
      // (y+1)*bottomRow[x] as signed 16x16 -> 32 bit (mullo/mulhi + unpack)
      __m128i tmpH = _mm_mulhi_epi16(bottomRow16L,y16);
      __m128i tmpL = _mm_mullo_epi16(bottomRow16L,y16);
      bottomRow16L = _mm_unpacklo_epi16(tmpL,tmpH);
      __m128i bottomRow16H = _mm_unpackhi_epi16(tmpL,tmpH);

      // (topRow[x] << log2H), widened to 32 bit (zero-extend via unpack with zero)
      __m128i topRow32L = _mm_unpacklo_epi16(topRow16,zero);
      __m128i topRow32H = _mm_unpackhi_epi16(topRow16,zero);
      topRow32L = _mm_sll_epi32(topRow32L,vLog2H);
      topRow32H = _mm_sll_epi32(topRow32H,vLog2H);
      // vertPred = (topRow[x] << log2H) + (y+1)*bottomRow[x];
      topRow32L = _mm_add_epi32(topRow32L,bottomRow16L);
      topRow32H = _mm_add_epi32(topRow32H,bottomRow16H);
      // horPred = leftColumn + (x+1)*rightColumn;
      tmpL = _mm_mullo_epi16(rightcolumn16,x16);
      tmpH = _mm_mulhi_epi16(rightcolumn16,x16);
      __m128i horpred32L = _mm_unpacklo_epi16(tmpL,tmpH);
      __m128i horpred32H = _mm_unpackhi_epi16(tmpL,tmpH);
      horpred32L = _mm_add_epi32(leftColumn32,horpred32L);
      horpred32H = _mm_add_epi32(leftColumn32,horpred32H);
      // pred[x] = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift;
      horpred32L = _mm_sll_epi32(horpred32L,vLog2H);
      horpred32H = _mm_sll_epi32(horpred32H,vLog2H);
      topRow32L = _mm_sll_epi32(topRow32L,vLog2W);
      topRow32H = _mm_sll_epi32(topRow32H,vLog2W);
      horpred32L = _mm_add_epi32(horpred32L,topRow32L);
      horpred32H = _mm_add_epi32(horpred32H,topRow32H);
      horpred32L = _mm_add_epi32(horpred32L,offset32);
      horpred32H = _mm_add_epi32(horpred32H,offset32);
      horpred32L = _mm_srl_epi32(horpred32L,vFinalShift);
      horpred32H = _mm_srl_epi32(horpred32H,vFinalShift);

      // narrow back to 16 bit and store 8/4/2/1 results depending on block width
      tmpL = _mm_packs_epi32(horpred32L,horpred32H);
      if (width>=8)
        _mm_storeu_si128(( __m128i * )(pred+y*stride+x), (tmpL) );
      else if (width==4)
        _mm_storeu_si64(( __m128i * )(pred+y*stride+x), (tmpL) );
      else if (width==2)
        _mm_storeu_si32(( __m128i * )(pred+y*stride+x),(tmpL) );
      else
        pred[y*stride+x]=(Pel)_mm_extract_epi16 (tmpL,0);   // width==1: single sample

      x16 = _mm_add_epi16(x16,eight);   // advance (x+1) by 8 for the next vector
    }
  }
}
888 | | |
/** Downsamples reconstructed luma to the 4:2:0 chroma grid (used for
 *  chroma-from-luma prediction).
 *
 *  For each output sample x, two consecutive luma rows are combined with a
 *  horizontal [1 2 1] filter (6 taps total, rounded):
 *
 *    pDst0[x] = (   rec[2x-1] + 2*rec[2x] + rec[2x+1]
 *                 + rec'[2x-1] + 2*rec'[2x] + rec'[2x+1] + 4 ) >> 3
 *
 *  where rec' = rec + iRecStride; the source pointer advances two luma rows
 *  per output row.
 *
 *  Three paths: AVX2 for width a multiple of 16, SSE for width a multiple
 *  of 8, and an SSE path with partial stores for width <= 4.  All paths
 *  widen the 16-bit samples to 32 bit via _mm_blend_epi16 masks before
 *  summing.
 *
 *  @param width       output (chroma-grid) width
 *  @param height      output (chroma-grid) height
 *  @param pRecSrc0    reconstructed luma, top-left of the source area
 *  @param iRecStride  luma stride in samples
 *  @param pDst0       downsampled output buffer
 *  @param iDstStride  output stride in samples
 *
 *  NOTE(review): this function guards its AVX2 path with '#ifdef USE_AVX2'
 *  while code earlier in the file uses '#if USE_AVX2'; the two differ if the
 *  macro is ever defined as 0 -- confirm it is only ever defined/undefined.
 */
template< X86_VEXT vext>
void GetLumaRecPixel420SIMD (const int width,const int height, const Pel* pRecSrc0,const ptrdiff_t iRecStride,Pel* pDst0,const ptrdiff_t iDstStride)
{
#ifdef USE_AVX2
  if( ( width & 15 ) == 0 ) // width>=16
// if( 0 ) // width>=16
  {
    __m256i vzero = _mm256_set1_epi8(0);
    __m256i vfour = _mm256_set1_epi32(4);
    for( int y = 0; y < height; y++ )
    {
      for( int x = 0; x < width; x += 16 )
      {
        int x2=x<<1;   // luma column: two luma samples per output sample
        __m256i vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
        __m256i vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7

        __m256i vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit (left taps)
        __m256i vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit (centre taps)
        __m256i vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit (right taps)
        vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 (centre weight 2)

        vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst0 = _mm256_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3, 32 bit; bottom row still missing

        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 +15]); // 7 8 9 10 11 12 13 14
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]); // 8 9 10 11 12 13 14 15

        x2+= (int)iRecStride;   // switch to the second luma row

        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
        vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst1 = _mm256_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7, 32 bit; bottom row still missing

        // now add in the next (bottom) luma row
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7

        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit
        vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst01 = _mm256_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3, 32 bit, bottom row

        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 15]); // 7 8 9 10 11 12 13 14
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]); // 8 9 10 11 12 13 14 15

        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
        vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst11 = _mm256_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7, 32 bit, bottom row

        // combine both rows, round (+4), normalise (>>3)
        vdst0 = _mm256_add_epi32(vdst0,vdst01);
        vdst1 = _mm256_add_epi32(vdst1,vdst11);
        vdst0 = _mm256_add_epi32(vdst0,vfour);
        vdst1 = _mm256_add_epi32(vdst1,vfour);
        vdst0 = _mm256_srli_epi32(vdst0,3);
        vdst1 = _mm256_srli_epi32(vdst1,3);
        vdst0 = _mm256_packus_epi32 (vdst0,vdst1); // 16 bit (pack interleaves the 128-bit lanes)
        vdst0 = _mm256_permute4x64_epi64(vdst0,0xd8); // restore linear order after the in-lane pack

        _mm256_storeu_si256((__m256i*)&pDst0[x], vdst0);
        // _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
      }
      pDst0 += iDstStride;
      pRecSrc0 += (iRecStride<<1);   // two luma rows per output row
    }
  }
  else
#endif
  if( ( width & 7 ) == 0 ) // width>=8
  {
    __m128i vzero = _mm_set1_epi8(0);
    __m128i vfour = _mm_set1_epi32(4);

    for( int y = 0; y < height; y++ )
    {

      for( int x = 0; x < width; x += 8 )
      {
        int x2=x<<1;   // luma column: two luma samples per output sample
        __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
        __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7

        __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit (left taps)
        __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit (centre taps)
        __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit (right taps)
        vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 (centre weight 2)

        vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
        __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3, 32 bit; bottom row still missing

        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 +7]); // 7 8 9 10 11 12 13 14
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]); // 8 9 10 11 12 13 14 15

        x2+=(int)iRecStride;   // switch to the second luma row

        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
        vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
        __m128i vdst1 = _mm_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7, 32 bit; bottom row still missing

        // now add in the next (bottom) luma row
        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7

        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit
        vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
        __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3, 32 bit, bottom row

        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 7]); // 7 8 9 10 11 12 13 14
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]); // 8 9 10 11 12 13 14 15

        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
        vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
        __m128i vdst11 = _mm_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7, 32 bit, bottom row

        // combine both rows, round (+4), normalise (>>3)
        vdst0 = _mm_add_epi32(vdst0,vdst01);
        vdst1 = _mm_add_epi32(vdst1,vdst11);
        vdst0 = _mm_add_epi32(vdst0,vfour);
        vdst1 = _mm_add_epi32(vdst1,vfour);
        vdst0 = _mm_srli_epi32(vdst0,3);
        vdst1 = _mm_srli_epi32(vdst1,3);
        vdst0 = _mm_packus_epi32 (vdst0,vdst1); // 16 bit

        _mm_storeu_si128((__m128i*)&pDst0[x], vdst0);
        // _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
      }
      pDst0 += iDstStride;
      pRecSrc0 += (iRecStride<<1);   // two luma rows per output row
    }
  }
  else // width<=4
  {
    __m128i vzero = _mm_set1_epi8(0);
    __m128i vfour = _mm_set1_epi32(4);

    // NOTE(review): a full 16-byte vector is loaded even though at most 4
    // outputs are produced -- assumes the reconstruction buffer has enough
    // margin to the right; confirm against the allocator.
    for( int y = 0; y < height; y++ )
    {
      __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[-1]); // -1 0 1 2 3 4 5 6
      __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[0]); // 0 1 2 3 4 5 6 7

      __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit (left taps)
      __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit (centre taps)
      __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit (right taps)
      vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
      vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 (centre weight 2)

      vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
      __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3, 32 bit; bottom row still missing

      // now add in the next (bottom) luma row
      vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride-1]); // -1 0 1 2 3 4 5 6
      vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride]); // 0 1 2 3 4 5 6 7

      vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit
      vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit
      vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit
      vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
      vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

      vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
      __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3, 32 bit, bottom row

      // combine both rows, round (+4), normalise (>>3)
      vdst0 = _mm_add_epi32(vdst0,vdst01);
      vdst0 = _mm_add_epi32(vdst0,vfour);
      vdst0 = _mm_srli_epi32(vdst0,3);
      vdst0 = _mm_packus_epi32 (vdst0,vdst0); // 16 bit

      // partial store: 4 / 2 / 1 samples
      if (width==4)
        _mm_storeu_si64(( __m128i * )&pDst0[0], (vdst0) );
      else if (width==2)
        _mm_storeu_si32(( __m128i * )&pDst0[0], (vdst0) );
      else
      {
        int tmp = _mm_cvtsi128_si32(vdst0);
        pDst0[0] = (Pel) tmp;
      }

      pDst0 += iDstStride;
      pRecSrc0 += (iRecStride<<1);   // two luma rows per output row
    }
  }
}
1103 | | |
1104 | | |
1105 | | |
/** Installs the x86 SIMD implementations of the intra-prediction kernels
 *  into the IntraPrediction function pointers.  Instantiated per vector
 *  extension (the X86_VEXT template parameter selects the code path inside
 *  each kernel template).
 */
template<X86_VEXT vext>
void IntraPrediction::_initIntraPredictionX86()
{
  // angular prediction cores (luma), template width parameter 4 / 8
  IntraPredAngleCore4   = IntraPredAngleCore_SIMD<vext, 4>;
  IntraPredAngleCore8   = IntraPredAngleCore_SIMD<vext, 8>;
  // angular prediction cores (chroma), template width parameter 4 / 8
  IntraPredAngleChroma4 = IntraPredAngleChroma_SIMD<vext, 4>;
  IntraPredAngleChroma8 = IntraPredAngleChroma_SIMD<vext, 8>;

  // prediction-sample filtering (see IntraPredSampleFilter_SIMD above)
  IntraPredSampleFilter8  = IntraPredSampleFilter_SIMD<vext, 8>;
  IntraPredSampleFilter16 = IntraPredSampleFilter_SIMD<vext, 16>;

  // planar intra prediction
  xPredIntraPlanar = xPredIntraPlanar_SIMD<vext>;

  // luma downsampling for 4:2:0 chroma-from-luma prediction
  GetLumaRecPixel420 = GetLumaRecPixel420SIMD<vext>;

}
1122 | | template void IntraPrediction::_initIntraPredictionX86<SIMDX86>(); |
1123 | | |
1124 | | #endif // TARGET_SIMD_X86 |
1125 | | #endif |
1126 | | |
1127 | | } |