/src/vvenc/source/Lib/CommonLib/x86/IntraPredX86.h
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | /** \file IntraPredX86.h |
43 | | \brief SIMD for IntraPrediction |
44 | | */ |
45 | | |
46 | | #pragma once |
47 | | |
48 | | #include "CommonDefX86.h" |
49 | | #include "Rom.h" |
50 | | #include "IntraPrediction.h" |
51 | | #include "InterpolationFilter.h" |
52 | | |
53 | | #include "Unit.h" |
54 | | |
55 | | #if ENABLE_SIMD_OPT_INTRAPRED |
56 | | #ifdef TARGET_SIMD_X86 |
57 | | //! \ingroup CommonLib |
58 | | //! \{ |
59 | | |
60 | | namespace vvenc { |
61 | | |
62 | | //#define USE_AVX2 |
/** \brief SIMD angular intra prediction for chroma (2-tap linear interpolation).
 *
 *  For every output row, each sample is interpolated between two neighbouring
 *  samples of the projected main reference line:
 *      pDst[x] = ( (32 - frac) * ref[int + x + 1] + frac * ref[int + x + 2] + 16 ) >> 5
 *  where int = deltaPos >> 5 and frac = deltaPos & 31 (1/32-pel accuracy);
 *  deltaPos advances by intraPredAngle per row.
 *
 *  \param pDst           destination block (16-bit samples)
 *  \param dstStride      destination stride in samples
 *  \param pBorder        projected main reference sample line
 *  \param width          block width  (vector paths need width to be a multiple of 8/16;
 *                        width 4 and the scalar tail for width 2 are handled separately)
 *  \param height         block height
 *  \param deltaPos       initial 1/32-pel position on the reference line
 *  \param intraPredAngle per-row increment of deltaPos in 1/32-pel units
 *
 *  NOTE(review): the AVX2 body is compiled only when USE_AVX2 is defined; if it is
 *  not defined, the "vext >= AVX2" branch is empty and writes nothing for
 *  width >= 8 — presumably USE_AVX2 is set by the build system for AVX2
 *  translation units (see the commented-out "#define USE_AVX2" above) — confirm.
 *  Also note the mixed "#ifdef USE_AVX2" here vs "#if USE_AVX2" at the end.
 */
template< X86_VEXT vext >
void IntraPredAngleChroma_SIMD(int16_t* pDst,const ptrdiff_t dstStride,int16_t* pBorder,int width,int height,int deltaPos,int intraPredAngle)
{
  int deltaInt;
  int deltaFract;
  int refMainIndex;
  __m128i voffset = _mm_set1_epi16(16);  // rounding offset for the >>5 at the end
  if( width >= 8 )
  {
    if( vext >= AVX2 )
    {
#ifdef USE_AVX2
      if (( width & 15 ) == 0 )
      {
        // 16 samples per iteration; these locals shadow the outer ones on purpose.
        int deltaInt;
        int deltaFract;
        int refMainIndex;

        __m256i voffset = _mm256_set1_epi16(16);
        for (int k=0; k<height; k++) {

          deltaInt   = deltaPos >> 5;        // integer part of the reference position
          deltaFract = deltaPos & (32 - 1);  // fractional part (1/32 pel)

          __m256i vfract      = _mm256_set1_epi16(deltaFract);
          __m256i v32minfract = _mm256_set1_epi16(32-deltaFract);
          // Do linear filtering: (32-frac)*ref[i] + frac*ref[i+1], round, >>5
          for (int l=0; l<width; l+=16) {
            refMainIndex = l+ deltaInt+1;
            __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]);
            __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]);
            vpred0 = _mm256_mullo_epi16(v32minfract, vpred0);
            vpred1 = _mm256_mullo_epi16(vfract, vpred1);
            __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5);
            _mm256_storeu_si256((__m256i*)&pDst[l], vpred);
          }
          pDst+=dstStride;
          deltaPos += intraPredAngle;
        }
      }
      else // width==8
      {
        // Same 2-tap filter with 128-bit vectors (8 samples per iteration).
        for (int k=0; k<height; k++)
        {
          deltaInt   = deltaPos >> 5;
          deltaFract = deltaPos & (32 - 1);

          __m128i vfract      = _mm_set1_epi16(deltaFract);
          __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
          // Do linear filtering
          for (int l=0; l<width; l+=8) {
            refMainIndex = l+ deltaInt+1;
            __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
            __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
            vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
            vpred1 = _mm_mullo_epi16(vfract, vpred1);
            __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
            _mm_storeu_si128((__m128i*)&pDst[l], vpred);
          }
          deltaPos += intraPredAngle;

          pDst+=dstStride;
        }

      }
#endif
    } //AVX2
    else
    {
      // SSE path for width >= 8 on pre-AVX2 targets: 8 samples per iteration.
      for (int k=0; k<height; k++) {
        deltaInt   = deltaPos >> 5;
        deltaFract = deltaPos & (32 - 1);

        __m128i vfract      = _mm_set1_epi16(deltaFract);
        __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
        // Do linear filtering
        for (int l=0; l<width; l+=8) {
          refMainIndex = l+ deltaInt+1;
          __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
          __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
          vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
          vpred1 = _mm_mullo_epi16(vfract, vpred1);
          __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
          _mm_storeu_si128((__m128i*)&pDst[l], vpred);
        }
        deltaPos += intraPredAngle;
        pDst+=dstStride;
      }
    }
  }
  else if( width == 4 )
  {
    // 4 samples per row: compute 8 lanes but store only the low 64 bits.
    // NOTE(review): the 128-bit load reads beyond the 5 samples actually used;
    // presumably pBorder is padded by the caller — confirm.
    for (int k=0; k<height; k++) {
      deltaInt   = deltaPos >> 5;
      deltaFract = deltaPos & (32 - 1);

      __m128i vfract      = _mm_set1_epi16(deltaFract);
      __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
      // Do linear filtering
      refMainIndex = deltaInt+1;
      __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
      __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
      vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
      vpred1 = _mm_mullo_epi16(vfract, vpred1);
      __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
      _vv_storel_epi64( ( __m128i * )(pDst ), vpred);
      deltaPos += intraPredAngle;
      pDst+=dstStride;
    }
  }
  else
  {
    // Scalar fallback for width 2 (the x < 2 loop bound fixes the row length).
    for (int y = 0; y<height; y++)
    {
      const int deltaInt   = deltaPos >> 5;
      const int deltaFract = deltaPos & (32 - 1);

      // Do linear filtering
      const Pel* pRM = pBorder + deltaInt + 1;
      int lastRefMainPel = *pRM++;

      for( int x = 0; x < 2; pRM++, x++ )
      {
        int thisRefMainPel = *pRM;
        pDst[x + 0] = ( Pel ) ( ( ( 32 - deltaFract )*lastRefMainPel + deltaFract*thisRefMainPel + 16 ) >> 5 );
        lastRefMainPel = thisRefMainPel;
      }
      deltaPos += intraPredAngle;
      pDst += dstStride;
    }
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}
199 | | |
/** \brief SIMD angular intra prediction for luma (4-tap interpolation filter).
 *
 *  Each output sample is a 4-tap filtered value of the projected main reference
 *  line at 1/32-pel accuracy, rounded as (sum + 32) >> 6.
 *  The filter is either
 *   - the 4-tap chroma interpolation table (useCubicFilter == true), whose
 *     output can overshoot and is therefore clipped to [clpRng.min(), clpRng.max()], or
 *   - the intra smoothing filter {16-f/2, 32-f/2, 16+f/2, f/2} computed inline
 *     from the fractional position (no clipping applied on that path).
 *
 *  \param pDstBuf        destination block (16-bit samples)
 *  \param dstStride      destination stride in samples
 *  \param refMain        projected main reference sample line
 *  \param width          block width; >= 4 required, otherwise THROWs
 *  \param height         block height
 *  \param deltaPos       initial 1/32-pel position on the reference line
 *  \param intraPredAngle per-row increment of deltaPos in 1/32-pel units
 *  \param ff_unused      legacy coefficient table parameter, no longer read
 *  \param useCubicFilter selects the chroma (cubic) filter table + clipping
 *  \param clpRng         clipping range used on the cubic-filter path
 *
 *  NOTE(review): as in the chroma kernel, the AVX2 body is only compiled when
 *  USE_AVX2 is defined; otherwise the "vext >= AVX2" branch is empty — confirm
 *  the build defines USE_AVX2 for AVX2 translation units.
 */
template< X86_VEXT vext >
void IntraPredAngleLumaCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t* refMain,int width,int height,int deltaPos,int intraPredAngle,const TFilterCoeff *ff_unused,const bool useCubicFilter,const ClpRng& clpRng)
{

  int16_t* pDst;
  if( width >= 8 )
  {

    if( vext >= AVX2 )
    {
#ifdef USE_AVX2
      // Shuffle mask producing four overlapping 4-sample windows per 128-bit lane
      // (byte indices select 16-bit samples i..i+3 for i = 0..3).
      __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa,0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8,0x7, 0x6, 0x5, 0x4,
                                         0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
      __m256i offset = _mm256_set1_epi32( 32 );  // rounding offset for the >>6

      if (( width & 15 ) == 0 )
      {
        __m256i vbdmin = _mm256_set1_epi16( clpRng.min() );
        __m256i vbdmax = _mm256_set1_epi16( clpRng.max() );

        for (int y = 0; y<height; y++ )
        {
          int deltaInt   = deltaPos >> 5;        // integer part of reference position
          int deltaFract = deltaPos & (32 - 1);  // 1/32-pel fractional part

          const TFilterCoeff intraSmoothingFilter[4] = {TFilterCoeff(16 - (deltaFract >> 1)), TFilterCoeff(32 - (deltaFract >> 1)), TFilterCoeff(16 + (deltaFract >> 1)), TFilterCoeff(deltaFract >> 1)};
          const TFilterCoeff *f = useCubicFilter ? InterpolationFilter::getChromaFilterTable(deltaFract) : intraSmoothingFilter;

          int refMainIndex = deltaInt + 1;
          pDst=&pDstBuf[y*dstStride];
          // __m128i tmp = _vv_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
          __m128i tmp = _vv_loadl_epi64( ( __m128i const * )f ); //load 4 16 bit filter coeffs
          tmp = _mm_shuffle_epi32(tmp,0x44);                  // duplicate the 4 coeffs into both 64-bit halves
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);   // ...and into both 128-bit lanes
          for( int x = 0; x < width; x+=16)
          {
            // First group of 8 output samples.
            __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex - 1] ) );
            __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
            src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
            src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9

            src1 = _mm256_madd_epi16 (src1, coeff);  // pairwise multiply-add -> two partial sums per sample
            src2 = _mm256_madd_epi16 (src2, coeff);

            __m256i sum = _mm256_hadd_epi32( src1, src2 );   // finish the 4-tap sums
            sum = _mm256_permute4x64_epi64(sum,0xD8);        // undo hadd's lane interleaving

            sum = _mm256_add_epi32( sum, offset );
            sum = _mm256_srai_epi32( sum, 6 );

            refMainIndex+=8;

            // Second group of 8 output samples.
            src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex - 1] ) );
            src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );

            src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
            src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9
            src1 = _mm256_madd_epi16 (src1, coeff);
            src2 = _mm256_madd_epi16 (src2, coeff);

            __m256i sum1 = _mm256_hadd_epi32( src1, src2 );
            sum1 = _mm256_permute4x64_epi64(sum1,0xD8);

            sum1 = _mm256_add_epi32( sum1, offset );
            sum1 = _mm256_srai_epi32( sum1, 6 );
            __m256i
            src0 = _mm256_packs_epi32( sum, sum1 );  // pack both groups back to 16 bit

            src0 = _mm256_permute4x64_epi64(src0,0xD8);

            refMainIndex+=8;

            // Only the cubic (interpolation) filter can overshoot -> clip.
            if (useCubicFilter)
              src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );

            _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
          }
          deltaPos += intraPredAngle;
        }
      }
      else // width =8
      {
        __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
        __m128i vbdmax = _mm_set1_epi16( clpRng.max() );

        for (int y = 0; y<height; y++ )
        {
          int deltaInt   = deltaPos >> 5;
          int deltaFract = deltaPos & (32 - 1);

          const TFilterCoeff intraSmoothingFilter[4] = {TFilterCoeff(16 - (deltaFract >> 1)), TFilterCoeff(32 - (deltaFract >> 1)), TFilterCoeff(16 + (deltaFract >> 1)), TFilterCoeff(deltaFract >> 1)};
          const TFilterCoeff *f = useCubicFilter ? InterpolationFilter::getChromaFilterTable(deltaFract) : intraSmoothingFilter;

          int refMainIndex = deltaInt + 1;
          pDst=&pDstBuf[y*dstStride];
          // __m128i tmp = _vv_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
          __m128i tmp = _mm_loadu_si64( f ); //load 4 16 bit filter coeffs
          tmp = _mm_shuffle_epi32(tmp,0x44);
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
          // One 8-sample row per iteration of the y loop.
          __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex - 1] ) );
          __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
          src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
          src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9

          src1 = _mm256_madd_epi16 (src1, coeff);
          src2 = _mm256_madd_epi16 (src2, coeff);

          __m256i sum = _mm256_hadd_epi32( src1, src2 );
          sum = _mm256_permute4x64_epi64(sum,0xD8);

          sum = _mm256_add_epi32( sum, offset );
          sum = _mm256_srai_epi32( sum, 6 );
          __m256i
          src0 = _mm256_permute4x64_epi64( _mm256_packs_epi32( sum, sum ), 0x88 );  // gather the 8 results into the low lane
          __m128i dest128 = _mm256_castsi256_si128( src0);

          if (useCubicFilter)
            dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );

          _mm_storeu_si128( ( __m128i * )(pDst), dest128);
          deltaPos += intraPredAngle;
        }
      }
#endif
    }
    else
    {
      // SSE path for width >= 8: two overlapping-window shuffles per 128-bit load.
      __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
      __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
      __m128i offset = _mm_set1_epi32( 32 );
      __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
      __m128i vbdmax = _mm_set1_epi16( clpRng.max() );

      for (int y = 0; y<height; y++ )
      {
        int deltaInt   = deltaPos >> 5;
        int deltaFract = deltaPos & (32 - 1);

        const TFilterCoeff intraSmoothingFilter[4] = {TFilterCoeff(16 - (deltaFract >> 1)), TFilterCoeff(32 - (deltaFract >> 1)), TFilterCoeff(16 + (deltaFract >> 1)), TFilterCoeff(deltaFract >> 1)};
        const TFilterCoeff *f = useCubicFilter ? InterpolationFilter::getChromaFilterTable(deltaFract) : intraSmoothingFilter;

        int refMainIndex = deltaInt + 1;
        pDst=&pDstBuf[y*dstStride];
        __m128i coeff = _mm_loadu_si64( f ); //load 4 16 bit filter coeffs
        // __m128i coeff = _vv_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
        coeff = _mm_shuffle_epi32(coeff,0x44);
        for( int x = 0; x < width; x+=8)
        {
          // First 4 output samples.
          __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6
          __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
          __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); // 1 2 3 4 2 3 4 5
          src0 = _mm_madd_epi16( coeff,src1 );
          src1 = _mm_madd_epi16( coeff,src2 );
          __m128i sum = _mm_hadd_epi32( src0, src1 );
          sum = _mm_add_epi32( sum, offset );
          sum = _mm_srai_epi32( sum, 6 );

          // Second 4 output samples.
          refMainIndex+=4;
          src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6
          src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
          src2 = _mm_shuffle_epi8(src0,shflmask2);

          // 1 2 3 4 2 3 4 5
          src0 = _mm_madd_epi16( coeff,src1 );
          src1 = _mm_madd_epi16( coeff,src2 );

          __m128i sum1 = _mm_hadd_epi32( src0, src1 );
          sum1 = _mm_add_epi32( sum1, offset );
          sum1 = _mm_srai_epi32( sum1, 6 );
          src0 = _mm_packs_epi32( sum, sum1 );  // back to 16 bit

          refMainIndex+=4;
          if (useCubicFilter)
            src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );

          _mm_storeu_si128( ( __m128i * )(pDst + x), src0);

        }
        deltaPos += intraPredAngle;
      }
    }
  }
  else if( width == 4 )
  {
    // width == 4: compute 4 samples, store the low 64 bits only.
    __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
    __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
    __m128i offset = _mm_set1_epi32( 32 );
    __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
    __m128i vbdmax = _mm_set1_epi16( clpRng.max() );

    for (int y = 0; y<height; y++ )
    {
      int deltaInt   = deltaPos >> 5;
      int deltaFract = deltaPos & (32 - 1);

      const TFilterCoeff intraSmoothingFilter[4] = {TFilterCoeff(16 - (deltaFract >> 1)), TFilterCoeff(32 - (deltaFract >> 1)), TFilterCoeff(16 + (deltaFract >> 1)), TFilterCoeff(deltaFract >> 1)};
      const TFilterCoeff *f = useCubicFilter ? InterpolationFilter::getChromaFilterTable(deltaFract) : intraSmoothingFilter;

      int refMainIndex = deltaInt + 1;
      pDst=&pDstBuf[y*dstStride];
      __m128i coeff = _mm_loadu_si64( f ); //load 4 16 bit filter coeffs
      // __m128i coeff = _vv_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
      coeff = _mm_shuffle_epi32(coeff,0x44);
      {
        __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6
        __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
        __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); // 1 2 3 4 2 3 4 5
        src0 = _mm_madd_epi16( coeff,src1 );
        src1 = _mm_madd_epi16( coeff,src2 );
        __m128i sum = _mm_hadd_epi32( src0, src1 );
        sum = _mm_add_epi32( sum, offset );
        sum = _mm_srai_epi32( sum, 6 );

        src0 = _mm_packs_epi32( sum, sum );

        refMainIndex+=4;

        if (useCubicFilter)
          src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );

        _vv_storel_epi64( ( __m128i * )(pDst ), src0);

      }
      deltaPos += intraPredAngle;
    }
  }
  else
  {
    THROW( "Unsupported size in IntraPredAngleCore_SIMD" );
  }
#if USE_AVX2
  _mm256_zeroupper();
#endif
}
434 | | |
/** \brief SIMD PDPC (position-dependent prediction combination) sample filtering.
 *
 *  Blends the intra prediction in dstBuf with the unfiltered left and top
 *  reference samples from Src, in place:
 *      dst[x][y] += ( wL[x]*(left[y]-dst) + wT[y]*(top[x]-dst) + 32 ) >> 6
 *  with wT = 32 >> min(31, (2*y) >> scale) and wL given by the precomputed
 *  weight vectors below (32, then halving every 1 << scale columns; zero
 *  beyond the first few columns, which is why only a limited column span is
 *  touched once wT == 0).
 *
 *  \tparam W      compile-time block-width class selecting load/store width
 *                 (2, 4, 8, or >8 for the AVX2 16-wide path)
 *  \param dstBuf  prediction block, filtered in place
 *  \param Src     reference sample buffer; row 0 holds the top samples at
 *                 ptrSrc[x+1], and the left samples are read at
 *                 ptrSrc[(y+1)+srcStride] — i.e. stored contiguously in the
 *                 second row (NOTE(review): layout inferred from the indexing;
 *                 confirm against the caller's reference-buffer fill).
 *
 *  NOTE(review): the first CHECK tests scale > 31 but its message says
 *  "scale > 2"; the SSE branch below re-checks scale > 2. Presumably the
 *  bound here should also be 2 — confirm.
 */
template< X86_VEXT vext, int W >
void IntraPredSampleFilter_SIMD(PelBuf& dstBuf, const CPelBuf& Src)
{
  const int iWidth  = dstBuf.width;
  const int iHeight = dstBuf.height;
  Pel* pDst = dstBuf.buf;
  const ptrdiff_t dstStride=dstBuf.stride;

  const Pel* ptrSrc = Src.buf;
  const ptrdiff_t srcStride=Src.stride;

  const int scale = ((floorLog2(iWidth * iHeight) - 2) >> 2);
  CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2");

#if USE_AVX2
  if( W > 8 )
  {
    __m256i tmplo,tmphi;
    __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32);  // rounding offset for the >>6
    __m256i wl16,wl16start;

    // Left-weight vectors: lane x holds wL(x) = 32 >> ((2*x) >> scale); zero lanes
    // mean the left term vanishes there (scale==0 default below).
    wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);

    if (scale==1)
    {
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
    }
    else if (scale==2)
    {
      wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
    }

    for (int y = 0; y < iHeight; y++)
    {
      int wT = 32 >> std::min(31, ((y << 1) >> scale));  // top weight for this row

      __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT);
      // Broadcast the left reference sample of this row to all 16 lanes.
      __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)+srcStride))));
      if (wT)
      {
        for (int x = 0; x < iWidth; x+=16)
        {
          if (x==0)
          {
            // First 16 columns: both left and top contributions.
            wl16=wl16start;

            __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1));                // load top
            __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x));  // load dst

            // 32-bit products via mullo/mulhi + unpack (inputs can be signed diffs).
            tmphi = _mm256_sub_epi16(x16left,x16dst);
            tmplo = _mm256_mullo_epi16(tmphi,wl16); //wL * (left-val)
            tmphi = _mm256_mulhi_epi16(tmphi,wl16); //wL * (left-val)
            __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
            __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);

            x16top = _mm256_sub_epi16(x16top,x16dst);
            tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*(top-val)
            tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*(top-val)
            __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
            __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);

            __m256i dstlo = _mm256_add_epi32(leftlo,toplo);
            __m256i dsthi = _mm256_add_epi32(lefthi,tophi);
            dstlo = _mm256_add_epi32(dstlo,w32);
            dsthi = _mm256_add_epi32(dsthi,w32);

            dstlo = _mm256_srai_epi32(dstlo,6);
            dsthi = _mm256_srai_epi32(dsthi,6);

            dstlo = _mm256_packs_epi32(dstlo,dsthi);
            dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );

            dstlo = _mm256_adds_epi16(dstlo,x16dst);  // add correction onto the prediction
            _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
          }
          else
          {
            // Columns >= 16: wL is zero, only the top contribution remains.
            __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1));                // load top
            __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x));  // load dst

            x16top = _mm256_sub_epi16(x16top,x16dst);
            tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*(top-val)
            tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*(top-val)
            __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
            __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);

            __m256i dstlo = _mm256_add_epi32(toplo,w32);
            __m256i dsthi = _mm256_add_epi32(tophi,w32);

            dstlo = _mm256_srai_epi32(dstlo,6);
            dsthi = _mm256_srai_epi32(dsthi,6);

            dstlo = _mm256_packs_epi32(dstlo,dsthi);
            dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );

            dstlo = _mm256_adds_epi16(dstlo,x16dst);
            _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
          }
        } // for x
      }
      else
      { // wT =0: only the left term survives, and it is zero beyond the first
        // 16 columns, so one 16-wide store per row suffices.
        wl16=wl16start;

        __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride));  // load dst

        tmphi = _mm256_sub_epi16(x16left,x16dst);
        tmplo = _mm256_mullo_epi16(tmphi,wl16); //wL * (left-val)
        tmphi = _mm256_mulhi_epi16(tmphi,wl16); //wL * (left-val)
        __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
        __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);

        __m256i dstlo = _mm256_add_epi32(leftlo,w32);
        __m256i dsthi = _mm256_add_epi32(lefthi,w32);

        dstlo = _mm256_srai_epi32(dstlo,6);
        dsthi = _mm256_srai_epi32(dsthi,6);

        dstlo = _mm256_packs_epi32(dstlo,dsthi);
        dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );

        dstlo = _mm256_adds_epi16(dstlo,x16dst);
        _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo );
      }
    }
  }
  else
#endif
  {
    // SSE path (W <= 8, or any W when AVX2 is not compiled in): 8 lanes per step.
    __m128i tmplo8,tmphi8;
    __m128i w32_8 = _mm_set_epi32(32,32,32,32);
    __m128i wl8start,wl8start2;  // left weights for columns 0..7 and 8..15
    CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2");

    wl8start  = _mm_set_epi16(0,0,0,0,0,2,8,32);
    wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);

    if (scale==1)
    {
      wl8start  = _mm_set_epi16(0,0,1,2,4,8,16,32);
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
    }
    else if (scale==2)
    {
      wl8start  = _mm_set_epi16(4,4,8,8,16,16,32,32);
      wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2);
    }

    __m128i wl8 = wl8start;
    for (int y = 0; y < iHeight; y++)
    {
      int wT = 32 >> std::min(31, ((y << 1) >> scale));  // top weight for this row

      __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT);
      __m128i x8left;

      // Load just enough bytes for the template width, then broadcast the
      // left reference sample of this row to all lanes.
      if ( W == 4 )
      {
        x8left = _mm_loadu_si64 ((__m128i const *) (ptrSrc+((y+1)+srcStride)));
      }
      else if ( W == 2 )
      {
        x8left = _mm_loadu_si32 ((__m128i const *) (ptrSrc+((y+1)+srcStride)));
      }
      else
      {
        x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)+srcStride)));
      }
      x8left =_mm_shufflelo_epi16(x8left,0);
      x8left =_mm_shuffle_epi32(x8left,0);

      if (wT)
      {
        for (int x = 0; x < iWidth; x+=8)
        {
          if (x>8)
          {
            // Columns >= 16: wL == 0, top contribution only.
            __m128i x8top;
            __m128i x8dst;

            if ( W == 4 )
            {
              x8top = _mm_loadu_si64((__m128i *) (ptrSrc+x+1));                // load top
              x8dst = _mm_loadu_si64((const __m128i *) (pDst+y*dstStride+x));  // load dst
            }
            else if ( W == 2 )
            {
              x8top = _mm_loadu_si32((__m128i *) (ptrSrc+x+1));                // load top
              x8dst = _mm_loadu_si32((const __m128i *) (pDst+y*dstStride+x));  // load dst
            }
            else
            {
              x8top = _mm_loadu_si128((__m128i *) (ptrSrc+x+1));                // load top
              x8dst = _mm_loadu_si128((const __m128i *) (pDst+y*dstStride+x));  // load dst
            }

            tmphi8 = _mm_sub_epi16(x8top,x8dst);
            tmplo8 = _mm_mullo_epi16(tmphi8,wt8); // wT*(top-val)
            tmphi8 = _mm_mulhi_epi16(tmphi8,wt8); // wT*(top-val)
            __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
            __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);

            __m128i dstlo8 = _mm_add_epi32(toplo8,w32_8);
            __m128i dsthi8 = _mm_add_epi32(tophi8,w32_8);

            dstlo8 = _mm_srai_epi32(dstlo8,6);
            dsthi8 = _mm_srai_epi32(dsthi8,6);

            dstlo8 = _mm_packs_epi32(dstlo8,dsthi8);
            dstlo8 = _mm_adds_epi16(dstlo8,x8dst);

            // x > 8 implies W >= 16, so a full 128-bit store is safe here.
            _mm_storeu_si128(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );

          }
          else // x<=8
          {
            // Pick the left-weight vector for this 8-column group.
            if (x==0)
              wl8=wl8start;
            else if (x==8)
              wl8=wl8start2;

            __m128i x8top;
            __m128i x8dst;

            if ( W == 4 )
            {
              x8top = _mm_loadu_si64((__m128i *) (ptrSrc+x+1));                // load top
              x8dst = _mm_loadu_si64((const __m128i *) (pDst+y*dstStride+x));  // load dst
            }
            else if ( W == 2 )
            {
              x8top = _mm_loadu_si32((__m128i *) (ptrSrc+x+1));                // load top
              x8dst = _mm_loadu_si32((const __m128i *) (pDst+y*dstStride+x));  // load dst
            }
            else
            {
              x8top = _mm_loadu_si128((__m128i *) (ptrSrc+x+1));                // load top
              x8dst = _mm_loadu_si128((const __m128i *) (pDst+y*dstStride+x));  // load dst
            }
            tmphi8 = _mm_sub_epi16(x8left,x8dst);
            tmplo8 = _mm_mullo_epi16(tmphi8,wl8); //wL * (left-val)
            tmphi8 = _mm_mulhi_epi16(tmphi8,wl8); //wL * (left-val)
            __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
            __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);

            tmphi8 = _mm_sub_epi16(x8top,x8dst);
            tmplo8 = _mm_mullo_epi16(tmphi8,wt8); // wT*(top-val)
            tmphi8 = _mm_mulhi_epi16(tmphi8,wt8); // wT*(top-val)
            __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
            __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);

            __m128i dstlo8 = _mm_add_epi32(leftlo8,toplo8);
            __m128i dsthi8 = _mm_add_epi32(lefthi8,tophi8);
            dstlo8 = _mm_add_epi32(dstlo8,w32_8);
            dsthi8 = _mm_add_epi32(dsthi8,w32_8);

            dstlo8 = _mm_srai_epi32(dstlo8,6);
            dsthi8 = _mm_srai_epi32(dsthi8,6);

            dstlo8 = _mm_packs_epi32(dstlo8,dsthi8);
            dstlo8 = _mm_adds_epi16(dstlo8,x8dst);

            // Store only as many samples as the template width allows.
            if (W>=8)
              _mm_storeu_si128(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
            else if (W==4)
              _vv_storel_epi64(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
            else if (W==2)
              _mm_storeu_si32(( __m128i * )(pDst+y*dstStride+x),(dstlo8) );
          }
        }
      }
      else //wT =0
      {
        // Left term only; wL is zero beyond column 15, so clamp the span.
        for (int x = 0; x < std::min(iWidth,16); x+=8)
        {
          if (x==0)
            wl8=wl8start;
          else
            wl8=wl8start2;

          __m128i x8dst ;

          if ( W == 4 )
          {
            x8dst = _mm_loadu_si64((const __m128i *) (pDst+y*dstStride+x));  // load dst
          }
          else if ( W == 2 )
          {
            x8dst = _mm_loadu_si32((const __m128i *) (pDst+y*dstStride+x));  // load dst
          }
          else
          {
            x8dst = _mm_loadu_si128((const __m128i *) (pDst+y*dstStride+x));  // load dst
          }
          tmphi8 = _mm_sub_epi16(x8left,x8dst);
          tmplo8 = _mm_mullo_epi16(tmphi8,wl8); //wL * (left-val)
          tmphi8 = _mm_mulhi_epi16(tmphi8,wl8); //wL * (left-val)
          __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
          __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);

          __m128i dstlo8 = _mm_add_epi32(leftlo8,w32_8);
          __m128i dsthi8 = _mm_add_epi32(lefthi8,w32_8);

          dstlo8 = _mm_srai_epi32(dstlo8,6);
          dsthi8 = _mm_srai_epi32(dsthi8,6);

          dstlo8 = _mm_packs_epi32(dstlo8,dsthi8);
          dstlo8 = _mm_adds_epi16(dstlo8,x8dst);

          if (W>=8)
            _mm_storeu_si128(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
          else if (W==4)
            _vv_storel_epi64(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
          else if (W==2)
            _mm_storeu_si32(( __m128i * )(pDst+y*dstStride+x),(dstlo8) );
        }
      }
    }
  }

}
760 | | |
761 | | template< X86_VEXT vext > |
762 | | void IntraPredSampleFilter_SIMD(PelBuf& dstBuf, const CPelBuf& srcBuf) |
763 | 0 | { |
764 | 0 | const int iWidth = dstBuf.width; |
765 | |
|
766 | 0 | if (iWidth>8) |
767 | 0 | IntraPredSampleFilter_SIMD<vext,16>(dstBuf, srcBuf); |
768 | 0 | else if (iWidth==8) |
769 | 0 | IntraPredSampleFilter_SIMD<vext,8>(dstBuf, srcBuf); |
770 | 0 | else if (iWidth==4) |
771 | 0 | IntraPredSampleFilter_SIMD<vext,4>(dstBuf, srcBuf); |
772 | 0 | else |
773 | 0 | IntraPredSampleFilter_SIMD<vext,2>(dstBuf, srcBuf); |
774 | |
|
775 | | #if USE_AVX2 |
776 | | _mm256_zeroupper(); |
777 | | #endif |
778 | 0 | } Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&) Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&) |
779 | | |
780 | | |
781 | | /** Function for deriving planar intra prediction. This function derives the prediction samples for planar mode (intra coding). |
782 | | */ |
template< X86_VEXT vext>
void xPredIntraPlanar_SIMD( PelBuf& pDst, const CPelBuf& pSrc)
{
  // SSE planar intra prediction, 8 output samples per inner iteration:
  //   pred[x] = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift
  // with horPred/vertPred the usual planar linear blends (see inline comments).
  const uint32_t width = pDst.width;
  const uint32_t height = pDst.height;
  const uint32_t log2W = floorLog2(width);
  const uint32_t log2H = floorLog2(height);
  const uint32_t finalShift = 1 + log2W + log2H;
  const uint32_t offset = 1 << (log2W + log2H);
  const ptrdiff_t stride = pDst.stride;
  Pel* pred = pDst.buf;

  const Pel* ptrSrc =pSrc.buf;

  int leftColumn,rightColumn;
  Pel tmp;
  // Top-right reference sample (row 0 of pSrc holds the above reference row).
  int topRight = pSrc.at( width + 1, 0 );

  // Bottom-left reference sample. Row 1 of pSrc appears to hold the left
  // reference column (the same indexing is used for leftColumn below) --
  // TODO confirm against the reference-buffer layout.
  tmp=pSrc.at( height+1, 1 );
  const __m128i bottomLeft16 = _mm_set_epi16(tmp,tmp,tmp,tmp,tmp,tmp,tmp,tmp);
  const __m128i zero = _mm_xor_si128(bottomLeft16,bottomLeft16);
  const __m128i eight = _mm_set_epi16(8,8,8,8,8,8,8,8);
  const __m128i offset32 = _mm_set_epi32(offset,offset,offset,offset);
  const __m128i vLog2W = _mm_cvtsi32_si128(log2W);
  const __m128i vLog2H = _mm_cvtsi32_si128(log2H);
  const __m128i vFinalShift = _mm_cvtsi32_si128(finalShift);

  for( int y = 0; y < height; y++)
  {
    leftColumn=pSrc.at( y + 1, 1 );
    rightColumn = topRight - leftColumn;
    leftColumn = leftColumn << log2W;
    __m128i leftColumn32 = _mm_set_epi32(leftColumn,leftColumn,leftColumn,leftColumn);
    __m128i rightcolumn16 = _mm_set_epi16(rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn);
    __m128i y16 = _mm_set_epi16(y+1,y+1,y+1,y+1,y+1,y+1,y+1,y+1);
    // Lane k holds (x+1) for the first 8 columns; advanced by 8 each x-step.
    __m128i x16 = _mm_set_epi16(8,7,6,5,4,3,2,1);

    for( int x = 0; x < width; x+=8 )
    {
      //topRow[x] = pSrc.at( x + 1, 0 );
      // NOTE: always reads 8 samples even when width < 8 -- assumes the
      // reference buffer is padded far enough for the overread; confirm.
      __m128i topRow16 = _mm_loadu_si128 ((__m128i const *) (ptrSrc+(x+1)));
      //bottomRow[x] = bottomLeft - topRow[x];
      __m128i bottomRow16L = _mm_sub_epi16(bottomLeft16,topRow16);
      // (y+1)*bottomRow[x] -- 16x16 -> 32 bit via mullo/mulhi + unpack
      __m128i tmpH = _mm_mulhi_epi16(bottomRow16L,y16);
      __m128i tmpL = _mm_mullo_epi16(bottomRow16L,y16);
      bottomRow16L = _mm_unpacklo_epi16(tmpL,tmpH);
      __m128i bottomRow16H = _mm_unpackhi_epi16(tmpL,tmpH);

      // (topRow[x] << log2H), widened to 32 bit first
      __m128i topRow32L = _mm_unpacklo_epi16(topRow16,zero);
      __m128i topRow32H = _mm_unpackhi_epi16(topRow16,zero);
      topRow32L = _mm_sll_epi32(topRow32L,vLog2H);
      topRow32H = _mm_sll_epi32(topRow32H,vLog2H);
      // vertPred = (topRow[x] << log2H) + (y+1)*bottomRow[x];
      topRow32L = _mm_add_epi32(topRow32L,bottomRow16L);
      topRow32H = _mm_add_epi32(topRow32H,bottomRow16H);
      // horPred = leftColumn + (x+1)*rightColumn;
      tmpL = _mm_mullo_epi16(rightcolumn16,x16);
      tmpH = _mm_mulhi_epi16(rightcolumn16,x16);
      __m128i horpred32L = _mm_unpacklo_epi16(tmpL,tmpH);
      __m128i horpred32H = _mm_unpackhi_epi16(tmpL,tmpH);
      horpred32L = _mm_add_epi32(leftColumn32,horpred32L);
      horpred32H = _mm_add_epi32(leftColumn32,horpred32H);
      // pred[x] = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift;
      horpred32L = _mm_sll_epi32(horpred32L,vLog2H);
      horpred32H = _mm_sll_epi32(horpred32H,vLog2H);
      topRow32L = _mm_sll_epi32(topRow32L,vLog2W);
      topRow32H = _mm_sll_epi32(topRow32H,vLog2W);
      horpred32L = _mm_add_epi32(horpred32L,topRow32L);
      horpred32H = _mm_add_epi32(horpred32H,topRow32H);
      horpred32L = _mm_add_epi32(horpred32L,offset32);
      horpred32H = _mm_add_epi32(horpred32H,offset32);
      horpred32L = _mm_srl_epi32(horpred32L,vFinalShift);
      horpred32H = _mm_srl_epi32(horpred32H,vFinalShift);

      // Narrow back to 16 bit and store; partial stores for narrow blocks.
      tmpL = _mm_packs_epi32(horpred32L,horpred32H);
      if (width>=8)
        _mm_storeu_si128(( __m128i * )(pred+y*stride+x), (tmpL) );
      else if (width==4)
        _vv_storel_epi64(( __m128i * )(pred+y*stride+x), (tmpL) );
      else if (width==2)
        _mm_storeu_si32(( __m128i * )(pred+y*stride+x),(tmpL) );
      else
        pred[y*stride+x]=(Pel)_mm_extract_epi16 (tmpL,0);

      x16 = _mm_add_epi16(x16,eight);
    }
  }
}
875 | | |
template< X86_VEXT vext>
void GetLumaRecPixel420SIMD (const int width,const int height, const Pel* pRecSrc0,const ptrdiff_t iRecStride,Pel* pDst0,const ptrdiff_t iDstStride)
{
  // Horizontally+vertically down-samples a luma reconstruction block by 2
  // (4:2:0 grid). Each output sample combines two luma rows:
  //   dst[x] = ( 2*s[2x] + s[2x-1] + s[2x+1]
  //            + 2*t[2x] + t[2x-1] + t[2x+1] + 4 ) >> 3
  // with s/t the upper/lower source rows (taps visible in the adds/shifts
  // below). Presumably used for CCLM luma reference derivation -- confirm
  // against the callers.
#ifdef USE_AVX2
  if( ( width & 15 ) == 0 ) // width>=16
  {
    __m256i vzero = _mm256_set1_epi8(0);
    __m256i vfour = _mm256_set1_epi32(4);
    for( int y = 0; y < height; y++ )
    {
      for( int x = 0; x < width; x += 16 )
      {
        int x2=x<<1;
        __m256i vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
        __m256i vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7

        // De-interleave even/odd source positions into 32-bit lanes.
        __m256i vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit
        __m256i vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit
        __m256i vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit
        vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst0 = _mm256_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 bit, bottom row still missing

        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 +15]); // 7 8 9 10 11 12 13 14
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]); // 8 9 10 11 12 13 14 15

        x2+= (int)iRecStride; // switch to the second (lower) source row

        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
        vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst1 = _mm256_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 bit, bottom row still missing

        // now add in the next (lower) source row
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7

        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit
        vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst01 = _mm256_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 bit, bottom row

        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 15]); // 7 8 9 10 11 12 13 14
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]); // 8 9 10 11 12 13 14 15

        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
        vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst11 = _mm256_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 bit, bottom row

        // Combine both rows, round (+4) and shift (>>3), pack to 16 bit.
        vdst0 = _mm256_add_epi32(vdst0,vdst01);
        vdst1 = _mm256_add_epi32(vdst1,vdst11);
        vdst0 = _mm256_add_epi32(vdst0,vfour);
        vdst1 = _mm256_add_epi32(vdst1,vfour);
        vdst0 = _mm256_srli_epi32(vdst0,3);
        vdst1 = _mm256_srli_epi32(vdst1,3);
        vdst0 = _mm256_packus_epi32 (vdst0,vdst1); // 16 bit
        // packus works per 128-bit lane -> restore linear order.
        vdst0 = _mm256_permute4x64_epi64(vdst0,0xd8);

        _mm256_storeu_si256((__m256i*)&pDst0[x], vdst0);
      }
      pDst0 += iDstStride;
      pRecSrc0 += (iRecStride<<1); // two source rows per destination row
    }
  }
  else
#endif
  if( ( width & 7 ) == 0 ) // width>=8
  {
    __m128i vzero = _mm_set1_epi8(0);
    __m128i vfour = _mm_set1_epi32(4);

    for( int y = 0; y < height; y++ )
    {
      for( int x = 0; x < width; x += 8 )
      {
        int x2=x<<1;
        __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
        __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7

        __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit
        __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit
        __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit
        vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
        __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 bit, bottom row still missing

        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 +7]); // 7 8 9 10 11 12 13 14
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]); // 8 9 10 11 12 13 14 15

        x2+=(int)iRecStride; // switch to the second (lower) source row

        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
        vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
        __m128i vdst1 = _mm_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 bit, bottom row still missing

        // now add in the next (lower) source row
        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7

        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit
        vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
        __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 bit, bottom row

        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 7]); // 7 8 9 10 11 12 13 14
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]); // 8 9 10 11 12 13 14 15

        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
        vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
        vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

        vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
        __m128i vdst11 = _mm_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 bit, bottom row

        // Combine both rows, round (+4) and shift (>>3), pack to 16 bit.
        vdst0 = _mm_add_epi32(vdst0,vdst01);
        vdst1 = _mm_add_epi32(vdst1,vdst11);
        vdst0 = _mm_add_epi32(vdst0,vfour);
        vdst1 = _mm_add_epi32(vdst1,vfour);
        vdst0 = _mm_srli_epi32(vdst0,3);
        vdst1 = _mm_srli_epi32(vdst1,3);
        vdst0 = _mm_packus_epi32 (vdst0,vdst1); // 16 bit

        _mm_storeu_si128((__m128i*)&pDst0[x], vdst0);
      }
      pDst0 += iDstStride;
      pRecSrc0 += (iRecStride<<1);
    }
  }
  else // width<=4
  {
    __m128i vzero = _mm_set1_epi8(0);
    __m128i vfour = _mm_set1_epi32(4);

    for( int y = 0; y < height; y++ )
    {
      // NOTE(review): the 128-bit loads read 8 samples even for width<=4 --
      // assumes the reconstruction buffer allows the overread; confirm.
      __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[-1]); // -1 0 1 2 3 4 5 6
      __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[0]); // 0 1 2 3 4 5 6 7

      __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit
      __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit
      __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit
      vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
      vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

      vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
      __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 bit, bottom row still missing

      // now add in the next (lower) source row
      vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride-1]); // -1 0 1 2 3 4 5 6
      vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride]); // 0 1 2 3 4 5 6 7

      vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 bit
      vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 bit
      vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 bit
      vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 bit
      vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2

      vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
      __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 bit, bottom row

      vdst0 = _mm_add_epi32(vdst0,vdst01);
      vdst0 = _mm_add_epi32(vdst0,vfour);
      vdst0 = _mm_srli_epi32(vdst0,3);
      vdst0 = _mm_packus_epi32 (vdst0,vdst0); // 16 bit

      if (width==4)
        _vv_storel_epi64(( __m128i * )&pDst0[0], (vdst0) );
      else if (width==2)
        _mm_storeu_si32(( __m128i * )&pDst0[0], (vdst0) );
      else // width==1
      {
        int tmp = _mm_cvtsi128_si32(vdst0);
        pDst0[0] = (Pel) tmp;
      }

      pDst0 += iDstStride;
      pRecSrc0 += (iRecStride<<1);
    }
  }
}
1089 | | |
1090 | | |
template<X86_VEXT vext, int W >
void IntraAnglePDPC_SIMD(Pel* pDsty,const int dstStride,Pel* refSide,const int width,const int height,int scale,int invAngle)
{
  // PDPC for angular intra modes: the first min(3<<scale, width) columns of
  // each predicted row are blended towards the projected left reference,
  //   dst[x] += ( wL * (left - dst[x]) + 32 ) >> 6,  wL = 32 >> (2*x >> scale)
  // (formula taken from the scalar fallback below). The weight vectors wl16
  // hold exactly these wL values per lane; `scale` is then re-used as the
  // number of active columns (3/6/12).
  if (W>=16)
  {
#ifdef USE_AVX2
    ALIGN_DATA( MEMORY_ALIGN_DEF_SIZE,short ref[16]);
    VALGRIND_MEMCLEAR( ref, sizeof( ref ) );

    __m256i wl16;
    if (scale==0)
    {
      wl16 = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32); // wL for x=0..2
      scale=3;
    }
    else if (scale==1)
    {
      wl16 = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32); // wL for x=0..5
      scale=6;
    }
    else
    {
      wl16 = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32); // wL for x=0..11
      scale=12;
    }

    __m256i v32 = _mm256_set1_epi32(32);
    for (int y = 0; y<height; y++, pDsty += dstStride)
    {
      // Gather the projected left reference samples for the active columns.
      int invAngleSum = 256;
      for (int x = 0; x < scale; x++)
      {
        invAngleSum += invAngle;
        ref[x]=refSide[y + (invAngleSum >> 9) + 1];
      }
      __m256i xleft= _mm256_load_si256((__m256i*)&ref[0]);
      __m256i xdst= _mm256_loadu_si256((__m256i*)pDsty);
      __m256i xdstlo=_mm256_sub_epi16 (xleft,xdst);
      // 16x16 -> 32 bit products of wL*(left-dst) via mullo/mulhi + unpack.
      __m256i tmplo = _mm256_mullo_epi16(xdstlo,wl16);
      __m256i tmphi = _mm256_mulhi_epi16(xdstlo,wl16);
      xdstlo = _mm256_unpacklo_epi16(tmplo,tmphi); //low
      tmphi = _mm256_unpackhi_epi16(tmplo,tmphi); // high

      tmplo = _mm256_add_epi32(xdstlo,v32);
      tmphi = _mm256_add_epi32(tmphi,v32);
      tmplo = _mm256_srai_epi32(tmplo,6);
      tmphi = _mm256_srai_epi32(tmphi,6);

      tmplo = _mm256_packs_epi32(tmplo,tmphi);
      // packs works per 128-bit lane; identity permute kept for clarity.
      tmplo = _mm256_permute4x64_epi64 ( tmplo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
      xdst = _mm256_add_epi16(tmplo,xdst);
      // Stores 16 pixels: lanes beyond the active columns carry wL==0 and
      // therefore write the unmodified dst values back.
      _mm256_storeu_si256( ( __m256i * )(pDsty), xdst );
    }
#else
    // Scalar reference implementation (no AVX2).
    for (int y = 0; y<height; y++, pDsty += dstStride)
    {
      int invAngleSum = 256;
      for (int x = 0; x < std::min(3 << scale, width); x++)
      {
        invAngleSum += invAngle;
        int wL = 32 >> (2 * x >> scale);
        Pel left = refSide[y + (invAngleSum >> 9) + 1];
        pDsty[x] = pDsty[x] + ((wL * (left - pDsty[x]) + 32) >> 6);
      }
    }
#endif
  }
  else // W is 4 or 8: 128-bit path
  {
    ALIGN_DATA( MEMORY_ALIGN_DEF_SIZE,short ref[8]);
    VALGRIND_MEMCLEAR( ref, sizeof( ref ) );

    __m128i wl16;
    if (scale==0)
    {
      wl16 = _mm_set_epi16(0,0,0,0,0,2,8,32); // wL for x=0..2
      scale=3;
    }
    else if (scale==1)
    {
      wl16 = _mm_set_epi16(0,0,1,2,4,8,16,32); // wL for x=0..5
      scale=6;
    }
    else
    {
      // scale==2 would affect 12 columns, but at most 8 fit here (W<=8,
      // so width<=8 and the clip below limits the work accordingly).
      wl16 = _mm_set_epi16(4,4,8,8,16,16,32,32); // wL for x=0..7
      scale=8;
    }

    int xlim=std::min(scale, width);

    __m128i v32 = _mm_set1_epi32(32);
    for (int y = 0; y<height; y++, pDsty += dstStride)
    {
      int invAngleSum = 256;
      for (int x = 0; x < xlim; x++)
      {
        invAngleSum += invAngle;
        ref[x]=refSide[y + (invAngleSum >> 9) + 1];
      }

      __m128i xleft;
      __m128i xdst;
      if (W==8)
      {
        xleft= _mm_load_si128((__m128i*)&ref[0]);
        xdst= _mm_loadu_si128((__m128i*)pDsty);
      }
      else
      {
        xleft= _mm_load_si128((__m128i*)&ref[0]);
        xdst= _mm_loadu_si64((__m128i*)pDsty); // 4 pixels, upper lanes zero
      }
      __m128i xdstlo=_mm_sub_epi16 (xleft,xdst);
      __m128i tmplo = _mm_mullo_epi16(xdstlo,wl16);
      __m128i tmphi = _mm_mulhi_epi16(xdstlo,wl16);
      xdstlo = _mm_unpacklo_epi16(tmplo,tmphi); //low
      tmphi = _mm_unpackhi_epi16(tmplo,tmphi); // high

      tmplo = _mm_add_epi32(xdstlo,v32);
      tmphi = _mm_add_epi32(tmphi,v32);
      tmplo = _mm_srai_epi32(tmplo,6);
      tmphi = _mm_srai_epi32(tmphi,6);

      tmplo = _mm_packs_epi32(tmplo,tmphi);
      xdst = _mm_add_epi16(tmplo,xdst);
      if (W==8)
        _mm_storeu_si128( ( __m128i * )(pDsty), xdst );
      else if (W==4)
        _vv_storel_epi64( ( __m128i * )(pDsty), xdst );
      else
      {
        THROW("wrong blocksize"); // W==2 is handled by the scalar dispatcher
      }
    }
  }
}
1235 | | |
1236 | | template<X86_VEXT vext > |
1237 | | void IntraAnglePDPC_SIMD(Pel* pDsty,const int dstStride,Pel* refSide,const int width,const int height,int scale,int invAngle) |
1238 | 0 | { |
1239 | 0 | if (width>=16) |
1240 | 0 | IntraAnglePDPC_SIMD<vext,16>(pDsty,dstStride,refSide,width,height,scale,invAngle); |
1241 | 0 | else if (width==8) |
1242 | 0 | IntraAnglePDPC_SIMD<vext,8>(pDsty,dstStride,refSide,width,height,scale,invAngle); |
1243 | 0 | else if (width==4) |
1244 | 0 | IntraAnglePDPC_SIMD<vext,4>(pDsty,dstStride,refSide,width,height,scale,invAngle); |
1245 | 0 | else |
1246 | 0 | for (int y = 0; y<height; y++, pDsty += dstStride) |
1247 | 0 | { |
1248 | 0 | int invAngleSum = 256; |
1249 | 0 | for (int x = 0; x < 2; x++) |
1250 | 0 | { |
1251 | 0 | invAngleSum += invAngle; |
1252 | 0 | int wL = 32 >> (2 * x >> scale); |
1253 | 0 | Pel left = refSide[y + (invAngleSum >> 9) + 1]; |
1254 | 0 | pDsty[x] = pDsty[x] + ((wL * (left - pDsty[x]) + 32) >> 6); |
1255 | 0 | } |
1256 | 0 | } |
1257 | | #if USE_AVX2 |
1258 | | _mm256_zeroupper(); |
1259 | | #endif |
1260 | 0 | } Unexecuted instantiation: void vvenc::IntraAnglePDPC_SIMD<(vvenc::x86_simd::X86_VEXT)1>(short*, int, short*, int, int, int, int) Unexecuted instantiation: void vvenc::IntraAnglePDPC_SIMD<(vvenc::x86_simd::X86_VEXT)4>(short*, int, short*, int, int, int, int) |
1261 | | |
template<X86_VEXT vext>
void IntraHorVerPDPC_SIMD(Pel* pDsty,const int dstStride,Pel* refSide,const int width,const int height,int scale,const Pel* refMain, const ClpRng& clpRng)
{
  // PDPC for the purely horizontal/vertical angular modes: each output row is
  // refMain[1..width], with the first min(3<<scale, width) columns offset by
  //   ( wL * (refSide[1+y] - topLeft) + 32 ) >> 6,  wL = 32 >> (2*x >> scale)
  // and clipped to clpRng (formula from the scalar fallback below).
  // NOTE(review): this function mixes `#ifdef USE_AVX2` and `#if USE_AVX2` --
  // both work when the macro is defined to 1, but confirm against the build
  // setup.
  const Pel topLeft = refMain[0];

  if (width>=16)
  {
#ifdef USE_AVX2
    __m256i v32 = _mm256_set1_epi32(32);
    __m256i vbdmin = _mm256_set1_epi16( clpRng.min() );
    __m256i vbdmax = _mm256_set1_epi16( clpRng.max() );

    __m256i wl16; // per-lane wL weights; lanes past the PDPC range are 0
    if (scale==0)
    {
      wl16 = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
    }
    else if (scale==1)
    {
      wl16 = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
    }
    else
    {
      wl16 = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
    }
    __m256i xtopLeft = _mm256_set_epi16(topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft);

    for (int y = 0; y<height; y++, pDsty += dstStride)
    {
      // first column group: apply the PDPC offset to the leftmost 16 pixels
      const Pel left = refSide[1 + y];
      __m256i xleft= _mm256_set_epi16(left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left);

      __m256i xdst= _mm256_loadu_si256((__m256i*)&refMain[1]);
      xleft = _mm256_sub_epi16(xleft,xtopLeft);

      // 16x16 -> 32 bit wL*(left-topLeft) via mullo/mulhi + unpack.
      __m256i tmplo = _mm256_mullo_epi16(xleft,wl16);
      __m256i tmphi = _mm256_mulhi_epi16(xleft,wl16);
      xleft = _mm256_unpacklo_epi16(tmplo,tmphi); //low
      tmphi = _mm256_unpackhi_epi16(tmplo,tmphi); // high

      tmplo = _mm256_add_epi32(xleft,v32);
      tmphi = _mm256_add_epi32(tmphi,v32);
      tmplo = _mm256_srai_epi32(tmplo,6);
      tmphi = _mm256_srai_epi32(tmphi,6);

      tmplo = _mm256_packs_epi32(tmplo,tmphi);
      tmplo = _mm256_permute4x64_epi64 ( tmplo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );

      xdst = _mm256_adds_epi16(tmplo,xdst);
      xdst = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, xdst ) ); // clip to bit-depth range
      _mm256_storeu_si256( ( __m256i * )(pDsty), xdst );

      // remaining columns: plain copy of refMain (wL would be 0 there)
      for (int x = 16; x < width; x+=16)
      {
        __m256i xdst= _mm256_loadu_si256((__m256i*)&refMain[x+1]);
        _mm256_storeu_si256( ( __m256i * )(&pDsty[x]), xdst );
      }
    }
#else
    // Scalar reference implementation (no AVX2).
    for( int y = 0; y < height; y++ )
    {
      memcpy(pDsty,&refMain[1],width*sizeof(Pel));
      const Pel left = refSide[1 + y];
      for (int x = 0; x < std::min(3 << scale, width); x++)
      {
        const int wL = 32 >> (2 * x >> scale);
        const Pel val = pDsty[x];
        pDsty[x] = ClipPel(val + ((wL * (left - topLeft) + 32) >> 6), clpRng);
      }
      pDsty += dstStride;
    }
#endif
  }
  else //width <= 8
  {
    __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
    __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
    __m128i wl16; // per-lane wL weights; lanes past the PDPC range are 0

    if (scale==0)
    {
      wl16 = _mm_set_epi16(0,0,0,0,0,2,8,32);
    }
    else if (scale==1)
    {
      wl16 = _mm_set_epi16(0,0,1,2,4,8,16,32);
    }
    else
    {
      wl16 = _mm_set_epi16(4,4,8,8,16,16,32,32);
    }

    __m128i v32 = _mm_set1_epi32(32);
    __m128i xtopLeft = _mm_set_epi16(topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft);

    for (int y = 0; y<height; y++, pDsty += dstStride)
    {
      // first (and only) column group, width<=8
      const Pel left = refSide[1 + y];
      __m128i xleft= _mm_set_epi16(left,left,left,left,left,left,left,left);

      __m128i xdst= _mm_loadu_si128((__m128i*)&refMain[1]);
      xleft = _mm_sub_epi16(xleft,xtopLeft);

      __m128i tmplo = _mm_mullo_epi16(xleft,wl16);
      __m128i tmphi = _mm_mulhi_epi16(xleft,wl16);
      xleft = _mm_unpacklo_epi16(tmplo,tmphi); //low
      tmphi = _mm_unpackhi_epi16(tmplo,tmphi); // high

      tmplo = _mm_add_epi32(xleft,v32);
      tmphi = _mm_add_epi32(tmphi,v32);
      tmplo = _mm_srai_epi32(tmplo,6);
      tmphi = _mm_srai_epi32(tmphi,6);

      tmplo = _mm_packs_epi32(tmplo,tmphi);

      xdst = _mm_adds_epi16(tmplo,xdst);
      xdst = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, xdst ) ); // clip to bit-depth range

      if (width==8)
        _mm_storeu_si128( ( __m128i * )(pDsty), xdst );
      else if (width==4)
        _vv_storel_epi64( ( __m128i * )(pDsty), xdst );
      else
      {
        _mm_storeu_si32( ( __m128i * )(pDsty), xdst );
      }
    }
  }
#if USE_AVX2
  _mm256_zeroupper();
#endif
}
1397 | | |
1398 | | |
1399 | | template<X86_VEXT vext> |
1400 | | void IntraPrediction::_initIntraPredictionX86() |
1401 | 0 | { |
1402 | 0 | IntraPredAngleLuma = IntraPredAngleLumaCore_SIMD<vext>; |
1403 | 0 | IntraPredAngleChroma = IntraPredAngleChroma_SIMD<vext>; |
1404 | 0 | IntraAnglePDPC = IntraAnglePDPC_SIMD<vext>; |
1405 | 0 | IntraHorVerPDPC = IntraHorVerPDPC_SIMD<vext>; |
1406 | 0 | IntraPredSampleFilter = IntraPredSampleFilter_SIMD<vext>; |
1407 | 0 | xPredIntraPlanar = xPredIntraPlanar_SIMD<vext>; |
1408 | 0 | } Unexecuted instantiation: void vvenc::IntraPrediction::_initIntraPredictionX86<(vvenc::x86_simd::X86_VEXT)1>() Unexecuted instantiation: void vvenc::IntraPrediction::_initIntraPredictionX86<(vvenc::x86_simd::X86_VEXT)4>() |
1409 | | template void IntraPrediction::_initIntraPredictionX86<SIMDX86>(); |
1410 | | |
1411 | | } // namespace vvenc |
1412 | | |
1413 | | //! \} |
1414 | | |
1415 | | #endif // TARGET_SIMD_X86 |
1416 | | #endif |
1417 | | //! \} |