/src/vvenc/source/Lib/CommonLib/x86/TrafoX86.h
/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the Clear BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.

The Clear BSD License

Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted (subject to the limitations in the disclaimer below) provided that
the following conditions are met:

     * Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.

     * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

     * Neither the name of the copyright holder nor the names of its
     contributors may be used to endorse or promote products derived from this
     software without specific prior written permission.

NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.


------------------------------------------------------------------------------------------- */
/** \file     TrafoX86.h
    \brief    SIMD implementation of the transform and LFNST kernels.
*/

//! \ingroup CommonLib
//! \{

#include "CommonLib/CommonDef.h"
#include "CommonDefX86.h"

#include "TrQuant.h"
#include "TrQuant_EMT.h"

#if ENABLE_SIMD_TRAFO
#ifdef TARGET_SIMD_X86

namespace vvenc {

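// fastInv_SSE: inverse transform core. For each pair of transform-matrix rows
// in 'it', the row coefficients are interleaved and multiply-accumulated into
// the partial result 'dst', scaled by the corresponding source coefficients in
// 'src'. Lines are processed in groups of up to four ('maxLoopL'); groups whose
// coefficients are all zero are detected up front and skipped. An AVX2 path
// covers trSize >= 8 where available, with SSE fallbacks down to trSize == 4.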
template<X86_VEXT vext, unsigned trSize>
void fastInv_SSE( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines, unsigned reducedLines, unsigned rows )
{
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );

#if USE_AVX2
  if( trSize >= 8 && vext >= AVX2 )
  {
    if( ( trSize & 15 ) == 0 )
    {
      static constexpr unsigned trLoops = trSize >> 4 ? trSize >> 4 : 1;

      for( int k = 0; k < rows; k += 2 )
      {
        TCoeff* dstPtr = dst;

        const TCoeff* srcPtr0 = &src[ k      * lines];
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];

        __m256i vsrc1v[trLoops][2];

        const TMatrixCoeff* itPtr0 = &it[ k      * trSize];
        const TMatrixCoeff* itPtr1 = &it[(k + 1) * trSize];

        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
        {
#if defined( _MSC_VER ) && _MSC_VER > 1900
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
#else
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
#endif

          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
        }

        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
        {
          __m128i xscale = maxLoopL == 4
                         ? _mm_packs_epi32( _mm_loadu_si128( ( const __m128i* )srcPtr0 ), _mm_loadu_si128( ( const __m128i* )srcPtr1 ) )
                         : _mm_packs_epi32( _vv_loadl_epi64( ( const __m128i* )srcPtr0 ), _vv_loadl_epi64( ( const __m128i* )srcPtr1 ) );
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

          for( int l = 0; l < maxLoopL; l++ )
          {
            __m256i
            vscale = _mm256_broadcastd_epi32( xscale );
            xscale = _mm_bsrli_si128( xscale, 4 );

            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
            {
              __m256i vsrc0 = _mm256_load_si256( ( const __m256i * ) dstPtr );

              __m256i
              vsrc1 = vsrc1v[col][0];
              vsrc1 = _mm256_madd_epi16( vsrc1, vscale );
              vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 );

              _mm256_store_si256( ( __m256i * ) dstPtr, vsrc0 );

              vsrc0 = _mm256_load_si256( ( const __m256i * ) &dstPtr[8] );

              vsrc1 = vsrc1v[col][1];
              vsrc1 = _mm256_madd_epi16( vsrc1, vscale );
              vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 );

              _mm256_store_si256( ( __m256i * ) &dstPtr[8], vsrc0 );
            }
          }
        }
      }
    }
    else
    {
      for( int k = 0; k < rows; k += 2 )
      {
        TCoeff* dstPtr = dst;

        const TCoeff* srcPtr0 = &src[ k      * lines];
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];

        const TMatrixCoeff* itPtr0 = &it[ k      * trSize];
        const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize];

        __m256i vit;

        {
#if defined( _MSC_VER ) && _MSC_VER > 1900
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#else
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#endif
#if defined( _MSC_VER ) && _MSC_VER > 1900
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#else
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#endif

          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
        }

        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
        {
          __m128i xscale = maxLoopL == 4
                         ? _mm_packs_epi32( _mm_loadu_si128( ( const __m128i* )srcPtr0 ), _mm_loadu_si128( ( const __m128i* )srcPtr1 ) )
                         : _mm_packs_epi32( _vv_loadl_epi64( ( const __m128i* )srcPtr0 ), _vv_loadl_epi64( ( const __m128i* )srcPtr1 ) );
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

          for( int l = 0; l < maxLoopL; l++ )
          {
            __m256i
            vscale = _mm256_broadcastd_epi32( xscale );
            xscale = _mm_bsrli_si128( xscale, 4 );

            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
            {
              __m256i
              vsrc0 = _mm256_load_si256( ( const __m256i * ) dstPtr );
              __m256i
              vsrc1 = _mm256_madd_epi16( vit, vscale );
              vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 );

              _mm256_store_si256( ( __m256i * ) dstPtr, vsrc0 );
            }
          }
        }
      }
    }
  }
#else
  if( trSize >= 8 )
  {
    for( int k = 0; k < rows; k += 2 )
    {
      TCoeff* dstPtr = dst;

      const TCoeff* srcPtr0 = &src[ k      * lines];
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];

      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
      {
        __m128i xscale = maxLoopL == 4
                       ? _mm_packs_epi32( _mm_loadu_si128( ( const __m128i* )srcPtr0 ), _mm_loadu_si128( ( const __m128i* )srcPtr1 ) )
                       : _mm_packs_epi32( _vv_loadl_epi64( ( const __m128i* )srcPtr0 ), _vv_loadl_epi64( ( const __m128i* )srcPtr1 ) );
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

        for( int l = 0; l < maxLoopL; l++ )
        {
          const TMatrixCoeff* itPtr0 = &it[ k      * trSize];
          const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize];

          __m128i
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
          xscale = _mm_bsrli_si128( xscale, 4 );

          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
          {
            __m128i vsrc0 = _mm_load_si128( ( const __m128i * ) dstPtr );
#if defined( _MSC_VER ) && _MSC_VER > 1900
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
#else
            __m128i vit16_0 = _mm_stream_load_si128( ( __m128i * ) itPtr0 );
            __m128i vit16_1 = _mm_stream_load_si128( ( __m128i * ) itPtr1 );
#endif

            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );

            vsrc1 = _mm_madd_epi16( vsrc1, vscale );
            vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 );

            _mm_store_si128( ( __m128i * ) dstPtr, vsrc0 );

            vsrc0 = _mm_load_si128( ( const __m128i * ) &dstPtr[4] );

            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );

            vsrc1 = _mm_madd_epi16( vsrc1, vscale );
            vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 );

            _mm_store_si128( ( __m128i * ) &dstPtr[4], vsrc0 );
          }
        }
      }
    }
  }
#endif
  else if( trSize >= 4 )
  {
    CHECKD( trSize != 4, "trSize needs to be '4'!" );

    for( int k = 0; k < rows; k += 2 )
    {
      TCoeff* dstPtr = dst;

      const TCoeff* srcPtr0 = &src[ k      * lines];
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];

      const TMatrixCoeff* itPtr0 = &it[ k      * trSize];
      const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize];

      __m128i vit = _mm_unpacklo_epi16( _vv_loadl_epi64( ( const __m128i * ) itPtr0 ), _vv_loadl_epi64( ( const __m128i * ) itPtr1 ) );

      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
      {
        __m128i xscale = maxLoopL == 4
                       ? _mm_packs_epi32( _mm_loadu_si128( ( const __m128i* )srcPtr0 ), _mm_loadu_si128( ( const __m128i* )srcPtr1 ) )
                       : _mm_packs_epi32( _vv_loadl_epi64( ( const __m128i* )srcPtr0 ), _vv_loadl_epi64( ( const __m128i* )srcPtr1 ) );
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

        for( int l = 0; l < maxLoopL; l++ )
        {
          __m128i
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
          xscale = _mm_bsrli_si128( xscale, 4 );

          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
          {
            __m128i
            vsrc0 = _mm_load_si128( ( const __m128i * ) dstPtr );
            __m128i
            vsrc1 = _mm_madd_epi16( vit, vscale );
            vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 );

            _mm_store_si128( ( __m128i * ) dstPtr, vsrc0 );
          }
        }
      }
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

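// fastFwd_SSE: forward transform core. The commented-out scalar loop below
// documents the reference behavior: dst[j * line + i] =
// ( sum_k src[i * trSize + k] * tc[j * trSize + k] + rnd_factor ) >> shift,
// evaluated for 'reducedLine' input lines and the first 'cutoff' basis rows.
// The SIMD paths compute the same sums with 16-bit multiply-accumulates,
// handling two (SSE) or up to four (AVX2) input lines per pass.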
template<X86_VEXT vext, int trSize>
void fastFwd_SSE( const TMatrixCoeff* tc, const TCoeff* src, TCoeff* dst, unsigned line, unsigned reducedLine, unsigned cutoff, int shift )
{
  const int rnd_factor = 1 << ( shift - 1 );

  //for( int i = 0; i < reducedLine; i++ )
  //{
  //  TCoeff* dstPtr = dst;
  //  const TMatrixCoeff* iT = tc;
  //
  //  for( int j = 0; j < cutoff; j++ )
  //  {
  //    int sum = 0;
  //
  //    for( int k = 0; k < trSize; k++ )
  //    {
  //      // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
  //      sum += src[k] * iT[k];
  //    }
  //
  //    dstPtr[i] = ( sum + rnd_factor ) >> shift;
  //    dstPtr += line;
  //    iT += trSize;
  //  }
  //
  //  src += trSize;
  //}

  if( trSize >= 8 )
  {
#if USE_AVX2
    if( vext >= AVX2 && ( trSize & 15 ) == 0 )
    {
#if FIX_FOR_TEMPORARY_COMPILER_ISSUES_ENABLED && defined( __GNUC__ ) && !defined( __clang__ )
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
      // vsrcarr[2] and vsrcarr[3] might be uninitialized for nlx4 == 0, but in that case they will not be used, so discard the warning!
#endif
      static constexpr unsigned trLoops = trSize >> 4 ? trSize >> 4 : 1;

      // is the number of lines a multiple of 4?
      const int nlx4 = reducedLine == 2 ? 0 : 1;

      for( int i = 0; i < reducedLine; i += ( 2 << nlx4 ) )
      {
        TCoeff* dstPtr = dst + i;
        const TMatrixCoeff* itPtr = tc;

        __m256i vsrcarr[trLoops][4];

        for( int k = 0; k < trLoops; k++ )
        {
          __m256i vsrc0 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 0] );
          __m256i vsrc1 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 8] );
          __m256i vsrc  = _mm256_packs_epi32( vsrc0, vsrc1 );
          vsrc = _mm256_permute4x64_epi64( vsrc, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );

          vsrcarr[k][0] = vsrc;

          vsrc0 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 0 + trSize] );
          vsrc1 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 8 + trSize] );
          vsrc  = _mm256_packs_epi32( vsrc0, vsrc1 );
          vsrc  = _mm256_permute4x64_epi64( vsrc, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );

          vsrcarr[k][1] = vsrc;

          if( !nlx4 ) continue;

          vsrc0 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 0 + 2 * trSize] );
          vsrc1 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 8 + 2 * trSize] );
          vsrc  = _mm256_packs_epi32( vsrc0, vsrc1 );
          vsrc  = _mm256_permute4x64_epi64( vsrc, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );

          vsrcarr[k][2] = vsrc;

          vsrc0 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 0 + 3 * trSize] );
          vsrc1 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 8 + 3 * trSize] );
          vsrc  = _mm256_packs_epi32( vsrc0, vsrc1 );
          vsrc  = _mm256_permute4x64_epi64( vsrc, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );

          vsrcarr[k][3] = vsrc;
        }

        for( int j = 0; j < cutoff; j += 4 )
        {
          __m256i vsum00 = _mm256_setzero_si256();
          __m256i vsum02 = _mm256_setzero_si256();

          __m256i vsum10 = _mm256_setzero_si256();
          __m256i vsum12 = _mm256_setzero_si256();

          __m256i vsum20 = _mm256_setzero_si256();
          __m256i vsum22 = _mm256_setzero_si256();

          __m256i vsum30 = _mm256_setzero_si256();
          __m256i vsum32 = _mm256_setzero_si256();

          for( int k = 0; k < trLoops; k++ )
          {
            // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]

#if 0
#if defined( _MSC_VER ) && _MSC_VER > 1900
            __m256i vit0 = _mm256_stream_load_si256( ( const __m256i* ) &itPtr[k + 0 * trSize] );
            __m256i vit1 = _mm256_stream_load_si256( ( const __m256i* ) &itPtr[k + 1 * trSize] );
            __m256i vit2 = _mm256_stream_load_si256( ( const __m256i* ) &itPtr[k + 2 * trSize] );
            __m256i vit3 = _mm256_stream_load_si256( ( const __m256i* ) &itPtr[k + 3 * trSize] );
#else
            __m256i vit0 = _mm256_stream_load_si256( ( __m256i* ) &itPtr[k + 0 * trSize] );
            __m256i vit1 = _mm256_stream_load_si256( ( __m256i* ) &itPtr[k + 1 * trSize] );
            __m256i vit2 = _mm256_stream_load_si256( ( __m256i* ) &itPtr[k + 2 * trSize] );
            __m256i vit3 = _mm256_stream_load_si256( ( __m256i* ) &itPtr[k + 3 * trSize] );
#endif
#else
            __m256i vit0 = _mm256_load_si256( ( const __m256i* ) &itPtr[(k << 4) + 0 * trSize] );
            __m256i vit1 = _mm256_load_si256( ( const __m256i* ) &itPtr[(k << 4) + 1 * trSize] );
            __m256i vit2 = _mm256_load_si256( ( const __m256i* ) &itPtr[(k << 4) + 2 * trSize] );
            __m256i vit3 = _mm256_load_si256( ( const __m256i* ) &itPtr[(k << 4) + 3 * trSize] );
#endif

            __m256i
            vsrc = vsrcarr[k][0];

            vsum00 = _mm256_add_epi32( vsum00, _mm256_hadd_epi32( _mm256_madd_epi16( vit0, vsrc ), _mm256_madd_epi16( vit1, vsrc ) ) );
            vsum02 = _mm256_add_epi32( vsum02, _mm256_hadd_epi32( _mm256_madd_epi16( vit2, vsrc ), _mm256_madd_epi16( vit3, vsrc ) ) );

            vsrc = vsrcarr[k][1];

            vsum10 = _mm256_add_epi32( vsum10, _mm256_hadd_epi32( _mm256_madd_epi16( vit0, vsrc ), _mm256_madd_epi16( vit1, vsrc ) ) );
            vsum12 = _mm256_add_epi32( vsum12, _mm256_hadd_epi32( _mm256_madd_epi16( vit2, vsrc ), _mm256_madd_epi16( vit3, vsrc ) ) );

            // skip branching
            //if( !nlx4 ) continue;

            vsrc = vsrcarr[k][2];

            vsum20 = _mm256_add_epi32( vsum20, _mm256_hadd_epi32( _mm256_madd_epi16( vit0, vsrc ), _mm256_madd_epi16( vit1, vsrc ) ) );
            vsum22 = _mm256_add_epi32( vsum22, _mm256_hadd_epi32( _mm256_madd_epi16( vit2, vsrc ), _mm256_madd_epi16( vit3, vsrc ) ) );

            vsrc = vsrcarr[k][3];

            vsum30 = _mm256_add_epi32( vsum30, _mm256_hadd_epi32( _mm256_madd_epi16( vit0, vsrc ), _mm256_madd_epi16( vit1, vsrc ) ) );
            vsum32 = _mm256_add_epi32( vsum32, _mm256_hadd_epi32( _mm256_madd_epi16( vit2, vsrc ), _mm256_madd_epi16( vit3, vsrc ) ) );
          }

          vsum00 = _mm256_hadd_epi32( vsum00, vsum02 );

          __m128i xsum00 = _mm_add_epi32( _mm256_castsi256_si128( vsum00 ), _mm256_extracti128_si256( vsum00, 1 ) );
          xsum00 = _mm_add_epi32 ( xsum00, _mm_set1_epi32( rnd_factor ) );
          xsum00 = _mm_srai_epi32( xsum00, shift );

          vsum10 = _mm256_hadd_epi32( vsum10, vsum12 );

          __m128i xsum10 = _mm_add_epi32( _mm256_castsi256_si128( vsum10 ), _mm256_extracti128_si256( vsum10, 1 ) );
          xsum10 = _mm_add_epi32 ( xsum10, _mm_set1_epi32( rnd_factor ) );
          xsum10 = _mm_srai_epi32( xsum10, shift );

          if( nlx4 )
          {
            vsum20 = _mm256_hadd_epi32( vsum20, vsum22 );

            __m128i xsum20 = _mm_add_epi32( _mm256_castsi256_si128( vsum20 ), _mm256_extracti128_si256( vsum20, 1 ) );
            xsum20 = _mm_add_epi32( xsum20, _mm_set1_epi32( rnd_factor ) );
            xsum20 = _mm_srai_epi32( xsum20, shift );

            vsum30 = _mm256_hadd_epi32( vsum30, vsum32 );

            __m128i xsum30 = _mm_add_epi32( _mm256_castsi256_si128( vsum30 ), _mm256_extracti128_si256( vsum30, 1 ) );
            xsum30 = _mm_add_epi32( xsum30, _mm_set1_epi32( rnd_factor ) );
            xsum30 = _mm_srai_epi32( xsum30, shift );

            __m128i xtmp0 = _mm_unpacklo_epi32( xsum00, xsum10 );
            __m128i xtmp1 = _mm_unpacklo_epi32( xsum20, xsum30 );

            _mm_store_si128( ( __m128i* ) dstPtr, _mm_unpacklo_epi64( xtmp0, xtmp1 ) ); dstPtr += line;
            _mm_store_si128( ( __m128i* ) dstPtr, _mm_unpackhi_epi64( xtmp0, xtmp1 ) ); dstPtr += line;

            xtmp0 = _mm_unpackhi_epi32( xsum00, xsum10 );
            xtmp1 = _mm_unpackhi_epi32( xsum20, xsum30 );

            _mm_store_si128( ( __m128i* ) dstPtr, _mm_unpacklo_epi64( xtmp0, xtmp1 ) ); dstPtr += line;
            _mm_store_si128( ( __m128i* ) dstPtr, _mm_unpackhi_epi64( xtmp0, xtmp1 ) ); dstPtr += line;
          }
          else
          {
            __m128i xtmp = _mm_unpacklo_epi32( xsum00, xsum10 );

            _vv_storel_epi64( ( __m128i* ) dstPtr, xtmp ); dstPtr += line;
            _vv_storel_epi64( ( __m128i* ) dstPtr, _mm_unpackhi_epi64( xtmp, xtmp ) ); dstPtr += line;

            xtmp = _mm_unpackhi_epi32( xsum00, xsum10 );

            _vv_storel_epi64( ( __m128i* ) dstPtr, xtmp ); dstPtr += line;
            _vv_storel_epi64( ( __m128i* ) dstPtr, _mm_unpackhi_epi64( xtmp, xtmp ) ); dstPtr += line;
          }

          itPtr += ( trSize << 2 );
        }

        src += ( trSize << ( 1 + nlx4 ) );
      }
#if FIX_FOR_TEMPORARY_COMPILER_ISSUES_ENABLED && defined( __GNUC__ ) && !defined( __clang__ )
#pragma GCC diagnostic pop
#endif
    }
    else
#endif
    {
      static constexpr unsigned trLoops = trSize >> 3 ? trSize >> 3 : 1;

      for( int i = 0; i < reducedLine; i += 2 )
      {
        TCoeff* dstPtr = dst + i;
        const TMatrixCoeff* itPtr = tc;

        __m128i vsrcarr[trLoops][2];

        for( int k = 0; k < trLoops; k++ )
        {
          __m128i vsrc0 = _mm_load_si128( ( const __m128i* ) &src[(k << 3) + 0] );
          __m128i vsrc1 = _mm_load_si128( ( const __m128i* ) &src[(k << 3) + 4] );
          __m128i vsrc  = _mm_packs_epi32( vsrc0, vsrc1 );

          vsrcarr[k][0] = vsrc;

          vsrc0 = _mm_load_si128( ( const __m128i* ) &src[(k << 3) + 0 + trSize] );
          vsrc1 = _mm_load_si128( ( const __m128i* ) &src[(k << 3) + 4 + trSize] );
          vsrc  = _mm_packs_epi32( vsrc0, vsrc1 );

          vsrcarr[k][1] = vsrc;
        }

        for( int j = 0; j < cutoff; j += 4 )
        {
          __m128i vsum00 = _mm_setzero_si128();
          __m128i vsum02 = _mm_setzero_si128();

          __m128i vsum10 = _mm_setzero_si128();
          __m128i vsum12 = _mm_setzero_si128();

          for( int k = 0; k < trLoops; k++ )
          {
            // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]

#if 0
#if defined( _MSC_VER ) && _MSC_VER > 1900
            __m128i vit0 = _mm_stream_load_si128( ( const __m128i* ) &itPtr[k + 0 * trSize] );
            __m128i vit1 = _mm_stream_load_si128( ( const __m128i* ) &itPtr[k + 1 * trSize] );
            __m128i vit2 = _mm_stream_load_si128( ( const __m128i* ) &itPtr[k + 2 * trSize] );
            __m128i vit3 = _mm_stream_load_si128( ( const __m128i* ) &itPtr[k + 3 * trSize] );
#else
            __m128i vit0 = _mm_stream_load_si128( ( __m128i* ) &itPtr[k + 0 * trSize] );
            __m128i vit1 = _mm_stream_load_si128( ( __m128i* ) &itPtr[k + 1 * trSize] );
            __m128i vit2 = _mm_stream_load_si128( ( __m128i* ) &itPtr[k + 2 * trSize] );
            __m128i vit3 = _mm_stream_load_si128( ( __m128i* ) &itPtr[k + 3 * trSize] );
#endif
#else
            __m128i vit0 = _mm_load_si128( ( const __m128i* ) &itPtr[(k << 3) + 0 * trSize] );
            __m128i vit1 = _mm_load_si128( ( const __m128i* ) &itPtr[(k << 3) + 1 * trSize] );
            __m128i vit2 = _mm_load_si128( ( const __m128i* ) &itPtr[(k << 3) + 2 * trSize] );
            __m128i vit3 = _mm_load_si128( ( const __m128i* ) &itPtr[(k << 3) + 3 * trSize] );
#endif

            // first source line
            __m128i vsrc = vsrcarr[k][0];

            vsum00 = _mm_add_epi32( vsum00, _mm_hadd_epi32( _mm_madd_epi16( vit0, vsrc ), _mm_madd_epi16( vit1, vsrc ) ) );
            vsum02 = _mm_add_epi32( vsum02, _mm_hadd_epi32( _mm_madd_epi16( vit2, vsrc ), _mm_madd_epi16( vit3, vsrc ) ) );

            // second source line
            vsrc = vsrcarr[k][1];

            vsum10 = _mm_add_epi32( vsum10, _mm_hadd_epi32( _mm_madd_epi16( vit0, vsrc ), _mm_madd_epi16( vit1, vsrc ) ) );
            vsum12 = _mm_add_epi32( vsum12, _mm_hadd_epi32( _mm_madd_epi16( vit2, vsrc ), _mm_madd_epi16( vit3, vsrc ) ) );
          }

          vsum00 = _mm_hadd_epi32( vsum00, vsum02 );
          vsum00 = _mm_add_epi32 ( vsum00, _mm_set1_epi32( rnd_factor ) );
          vsum00 = _mm_srai_epi32( vsum00, shift );

          vsum10 = _mm_hadd_epi32( vsum10, vsum12 );
          vsum10 = _mm_add_epi32 ( vsum10, _mm_set1_epi32( rnd_factor ) );
          vsum10 = _mm_srai_epi32( vsum10, shift );

          __m128i xtmp = _mm_unpacklo_epi32( vsum00, vsum10 );
          _vv_storel_epi64( ( __m128i* ) dstPtr, xtmp ); dstPtr += line;

          xtmp = _mm_shuffle_epi32( xtmp, ( 2 << 0 ) + ( 3 << 2 ) );
          _vv_storel_epi64( ( __m128i* ) dstPtr, xtmp ); dstPtr += line;

          xtmp = _mm_unpackhi_epi32( vsum00, vsum10 );
          _vv_storel_epi64( ( __m128i* ) dstPtr, xtmp ); dstPtr += line;

          xtmp = _mm_shuffle_epi32( xtmp, ( 2 << 0 ) + ( 3 << 2 ) );
          _vv_storel_epi64( ( __m128i* ) dstPtr, xtmp ); dstPtr += line;

          itPtr += ( trSize << 2 );
        }

        src += ( trSize << 1 );
      }
    }
  }
  else
  {
    __m128i vzero = _mm_setzero_si128();

    for( int i = 0; i < reducedLine; i++ )
    {
      TCoeff* dstPtr = dst;
      const TMatrixCoeff* itPtr = tc;

      for( int j = 0; j < cutoff; j++ )
      {
        __m128i vit   = _vv_loadl_epi64( ( const __m128i* ) itPtr );
        __m128i vsrc0 = _mm_load_si128 ( ( const __m128i* ) src );

        __m128i vsrc = _mm_packs_epi32( vsrc0, vzero );
        __m128i vsum = _mm_madd_epi16 ( vit, vsrc );

        dstPtr[i] = ( _mm_extract_epi32( vsum, 0 ) + _mm_extract_epi32( vsum, 1 ) + rnd_factor ) >> shift;

        dstPtr += line;
        itPtr  += trSize;
      }

      src += trSize;
    }
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

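// roundClip_SSE: adds 'round', arithmetically shifts right by 'shift' and
// clips every coefficient of a width x height block to [outputMin, outputMax],
// in place on the strided 'dst' buffer.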
template< X86_VEXT vext, int W >
void roundClip_SSE( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
{
#if USE_AVX2
  if( W >= 8 && vext >= AVX2 )
  {
    __m256i vmin = _mm256_set1_epi32( outputMin );
    __m256i vmax = _mm256_set1_epi32( outputMax );
    __m256i vrnd = _mm256_set1_epi32( round );

    while( height-- )
    {
      for( int col = 0; col < width; col += 8 )
      {
        __m256i
        vdst = _mm256_load_si256( ( __m256i * ) &dst[col] );
        vdst = _mm256_add_epi32 ( vdst, vrnd );
        vdst = _mm256_srai_epi32( vdst, shift );
        vdst = _mm256_max_epi32 ( vdst, vmin );
        vdst = _mm256_min_epi32 ( vdst, vmax );
        _mm256_store_si256( ( __m256i * ) &dst[col], vdst );
      }

      dst += stride;
    }
  }
  else
#endif
  if( W >= 4 )
  {
    __m128i vmin = _mm_set1_epi32( outputMin );
    __m128i vmax = _mm_set1_epi32( outputMax );
    __m128i vrnd = _mm_set1_epi32( round );

    while( height-- )
    {
      for( int col = 0; col < width; col += 4 )
      {
        __m128i
        vdst = _mm_load_si128( ( __m128i * ) &dst[col] );
        vdst = _mm_add_epi32 ( vdst, vrnd );
        vdst = _mm_srai_epi32( vdst, shift );
        vdst = _mm_max_epi32 ( vdst, vmin );
        vdst = _mm_min_epi32 ( vdst, vmax );
        _mm_store_si128( ( __m128i * ) &dst[col], vdst );
      }

      dst += stride;
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

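// cpyResi_SSE: packs 32-bit residual coefficients down to 16-bit Pel values
// (with signed saturation via packs) and copies them from the contiguous
// 'src' block into the strided 'dst' buffer.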
template< X86_VEXT vext, int W >
void cpyResi_SSE( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height )
{
#if USE_AVX2
  if( W >= 8 && vext >= AVX2 )
  {
    while( height-- )
    {
      for( int col = 0; col < width; col += 8 )
      {
        __m256i
        vsrc = _mm256_load_si256( ( const __m256i * ) &src[col] );
        __m128i
        vdst = _mm_packs_epi32  ( _mm256_castsi256_si128( vsrc ), _mm256_extracti128_si256( vsrc, 1 ) );
        _mm_storeu_si128( ( __m128i * ) &dst[col], vdst );
      }

      src += width;
      dst += stride;
    }
  }
  else
#endif
  if( W >= 4 )
  {
    __m128i vzero = _mm_setzero_si128();
    __m128i vdst;

    while( height-- )
    {
      for( int col = 0; col < width; col += 4 )
      {
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
        vdst = _mm_packs_epi32( vdst, vzero );
        _vv_storel_epi64( ( __m128i * ) &dst[col], vdst );
      }

      src += width;
      dst += stride;
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

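// cpyCoeff_SSE: the inverse of cpyResi_SSE — sign-extends 16-bit Pel values to
// 32-bit TCoeff and copies them from the strided 'src' buffer into the
// contiguous 'dst' block.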
template< X86_VEXT vext, int W >
void cpyCoeff_SSE( const Pel* src, ptrdiff_t stride, TCoeff* dst, unsigned width, unsigned height )
{
#if USE_AVX2
  if( W >= 8 && vext >= AVX2 )
  {
    while( height-- )
    {
      for( int col = 0; col < width; col += 8 )
      {
        __m256i vtmp = _mm256_cvtepi16_epi32( _mm_loadu_si128( ( const __m128i * ) &src[col] ) );
        _mm256_store_si256( ( __m256i * ) &dst[col], vtmp );
      }

      src += stride;
      dst += width;
    }
  }
  else
#endif
  if( W >= 4 )
  {
    while( height-- )
    {
      for( int col = 0; col < width; col += 4 )
      {
        __m128i vtmp = _mm_cvtepi16_epi32( _vv_loadl_epi64( ( const __m128i * ) &src[col] ) );
        _mm_store_si128( ( __m128i * ) &dst[col], vtmp );
      }

      src += stride;
      dst += width;
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

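// simdInvLfnstNxN: inverse LFNST (low-frequency non-separable transform).
// Multiplies the input vector by the 8-bit inverse LFNST matrix
// (g_lfnstInv8x8/g_lfnstInv4x4), rounds with offset 64 and shift 7, and clips
// the result to the 16-bit transform dynamic range. 'zeroOutSize' (8 or 16)
// limits how many input coefficients can be non-zero.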
template<X86_VEXT vext>
void simdInvLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize )
{
  CHECK( index > 2 || ( zeroOutSize != 8 && zeroOutSize != 16 ), "Wrong parameters" );

  static constexpr int maxLog2TrDynamicRange = 15;
  const TCoeff outputMinimum = -( 1 << maxLog2TrDynamicRange );
  const TCoeff outputMaximum =  ( 1 << maxLog2TrDynamicRange ) - 1;
  const int8_t* trMat  = ( size > 4 ) ? g_lfnstInv8x8[mode][index][0] : g_lfnstInv4x4[mode][index][0];
  const int     trSize = ( size > 4 ) ? 48 : 16;
  int*          out    = dst;

  const __m128i vzero = _mm_setzero_si128();
  const __m128i vmin  = _mm_set1_epi32( outputMinimum );
  const __m128i vmax  = _mm_set1_epi32( outputMaximum );

  for( int j = 0; j < trSize; j += 4, out += 4 )
  {
    __m128i vsum[4];

    for( int k = 0; k < 4; k++, trMat += 16 )
    {
      const int8_t* trMatTmp = trMat;
      int*          srcPtr   = src;

      __m128i vsrc;
      __m128i vtr;
      __m128i vtmp;
      __m128i vcur = vzero;

      for( int i = 0; i < zeroOutSize; i += 8, srcPtr += 8, trMatTmp += 8 )
      {
        vsrc = _mm_loadu_si128( ( const __m128i* ) srcPtr );
        vtr  = _vv_loadl_epi64( ( const __m128i* ) trMatTmp );
        vtr  = _mm_cvtepi8_epi16( vtr );
        vtmp = _mm_cvtepi16_epi32( vtr );

        vtmp = _mm_mullo_epi32( vsrc, vtmp );
        vcur = _mm_add_epi32( vtmp, vcur );

        vsrc = _mm_loadu_si128( ( const __m128i* ) &srcPtr[4] );
        vtmp = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtr, vzero ) );

        vtmp = _mm_mullo_epi32( vsrc, vtmp );
        vcur = _mm_add_epi32( vtmp, vcur );
      }

      vsum[k] = vcur;
    }

    __m128i vout = _mm_hadd_epi32( _mm_hadd_epi32( vsum[0], vsum[1] ), _mm_hadd_epi32( vsum[2], vsum[3] ) );
    vout = _mm_add_epi32( vout, _mm_set1_epi32( 64 ) );
    vout = _mm_srai_epi32( vout, 7 );
    vout = _mm_min_epi32( _mm_max_epi32( vmin, vout ), vmax );

    _mm_storeu_si128( ( __m128i* ) out, vout );
  }
}

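// simdFwdLfnstNxN: forward LFNST. Multiplies the input vector by the 8-bit
// forward LFNST matrix (g_lfnstFwd8x8/g_lfnstFwd4x4), rounding with offset 64
// and shift 7. Only the first 'zeroOutSize' output coefficients are computed;
// the remaining trSize - zeroOutSize entries are zeroed with memset.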
template<X86_VEXT vext>
void simdFwdLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize )
{
  const int8_t* trMat  = ( size > 4 ) ? g_lfnstFwd8x8[mode][index][0] : g_lfnstFwd4x4[mode][index][0];
  const int     trSize = ( size > 4 ) ? 48 : 16;
  int*          out    = dst;

  const __m128i vzero = _mm_setzero_si128();

  for( int j = 0; j < zeroOutSize; j += 4, out += 4 )
  {
    __m128i vout[4];

    for( int k = 0; k < 4; k++ )
    {
      int*          srcPtr   = src;
      const int8_t* trMatTmp = trMat;

      __m128i vsum = vzero;

      for( int i = 0; i < trSize; i += 16, srcPtr += 16, trMatTmp += 16 )
      {
        __m128i vtrc = _mm_loadu_si128( ( const __m128i* ) trMatTmp );
        __m128i vtrl = _mm_cvtepi8_epi16( vtrc );
        __m128i vtrh = _mm_cvtepi8_epi16( _mm_unpackhi_epi64( vtrc, vzero ) );

        __m128i vsrc0 = _mm_loadu_si128( ( const __m128i* ) &srcPtr[0] );
        vtrc  = _mm_cvtepi16_epi32( vtrl );
        vsrc0 = _mm_mullo_epi32( vsrc0, vtrc );

        __m128i vsrc1 = _mm_loadu_si128( ( const __m128i* ) &srcPtr[4] );
        vtrc  = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtrl, vzero ) );
        vsrc1 = _mm_mullo_epi32( vsrc1, vtrc );

        __m128i vsrc2 = _mm_loadu_si128( ( const __m128i* ) &srcPtr[8] );
        vtrc  = _mm_cvtepi16_epi32( vtrh );
        vsrc2 = _mm_mullo_epi32( vsrc2, vtrc );

        __m128i vsrc3 = _mm_loadu_si128( ( const __m128i* ) &srcPtr[12] );
        vtrc  = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtrh, vzero ) );
        vsrc3 = _mm_mullo_epi32( vsrc3, vtrc );

        vsrc0 = _mm_add_epi32( vsrc0, vsrc1 );
        vsrc2 = _mm_add_epi32( vsrc2, vsrc3 );

        vsum = _mm_add_epi32( vsum, _mm_add_epi32( vsrc0, vsrc2 ) );
      }

      vout[k] = vsum;
      trMat  += trSize;
    }

    __m128i vdst = _mm_hadd_epi32( _mm_hadd_epi32( vout[0], vout[1] ), _mm_hadd_epi32( vout[2], vout[3] ) );
    vdst = _mm_add_epi32( vdst, _mm_set1_epi32( 64 ) );
    vdst = _mm_srai_epi32( vdst, 7 );

    _mm_storeu_si128( ( __m128i* ) out, vdst );
  }

  ::memset( out, 0, ( trSize - zeroOutSize ) * sizeof( int ) );
}

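// _initTCoeffOpsX86: wires the TCoeffOps function pointers to the SIMD kernels
// above, instantiated for the selected vector extension and for transform
// sizes 4/8/16/32/64.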
template<X86_VEXT vext>
void TCoeffOps::_initTCoeffOpsX86()
{
  cpyResi4   = cpyResi_SSE  <vext, 4>;
  cpyResi8   = cpyResi_SSE  <vext, 8>;
  cpyCoeff4  = cpyCoeff_SSE <vext, 4>;
  cpyCoeff8  = cpyCoeff_SSE <vext, 8>;
  roundClip4 = roundClip_SSE<vext, 4>;
  roundClip8 = roundClip_SSE<vext, 8>;

  fastInvCore[0] = fastInv_SSE<vext,  4>;
  fastInvCore[1] = fastInv_SSE<vext,  8>;
  fastInvCore[2] = fastInv_SSE<vext, 16>;
  fastInvCore[3] = fastInv_SSE<vext, 32>;
  fastInvCore[4] = fastInv_SSE<vext, 64>;

  fastFwdCore_2D[0] = fastFwd_SSE<vext,  4>;
  fastFwdCore_2D[1] = fastFwd_SSE<vext,  8>;
  fastFwdCore_2D[2] = fastFwd_SSE<vext, 16>;
  fastFwdCore_2D[3] = fastFwd_SSE<vext, 32>;
  fastFwdCore_2D[4] = fastFwd_SSE<vext, 64>;
}

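// _initTrQuantX86: wires the TrQuant LFNST function pointers to the SIMD
// kernels above.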
template<X86_VEXT vext>
void TrQuant::_initTrQuantX86()
{
  m_invLfnstNxN = simdInvLfnstNxN<vext>;
  m_fwdLfnstNxN = simdFwdLfnstNxN<vext>;
}

template void TCoeffOps::_initTCoeffOpsX86<SIMDX86>();
template void TrQuant::_initTrQuantX86<SIMDX86>();

}

#endif // TARGET_SIMD_X86
#endif
//! \}