/src/vvdec/source/Lib/CommonLib/x86/TrafoX86.h
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | |
43 | | /** \file TrafoX86.h |
44 | | \brief SIMD trafo |
45 | | */ |
46 | | |
47 | | //! \ingroup CommonLib |
48 | | //! \{ |
49 | | |
50 | | |
51 | | #include "CommonLib/CommonDef.h" |
52 | | #include "CommonLib/Rom.h" |
53 | | |
54 | | #include "CommonDefX86.h" |
55 | | |
56 | | #include "TrQuant.h" |
57 | | #include "TrQuant_EMT.h" |
58 | | |
59 | | namespace vvdec |
60 | | { |
61 | | |
62 | | #if ENABLE_SIMD_TCOEFF_OPS |
63 | | #ifdef TARGET_SIMD_X86 |
64 | | |
65 | | template< X86_VEXT vext, int trSize > |
66 | | void fastInv_SSE( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines, unsigned reducedLines, unsigned rows ) |
67 | 37.4k | { |
68 | 37.4k | unsigned maxLoopL = std::min<int>( reducedLines, 4 ); |
69 | | |
70 | | #if USE_AVX2 |
71 | 37.4k | if( trSize >= 8 && vext >= AVX2 ) |
72 | 36.7k | { |
73 | 36.7k | if( ( trSize & 15 ) == 0 ) |
74 | 29.9k | { |
75 | 29.9k | static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1; |
76 | | |
77 | 136k | for( int k = 0; k < rows; k += 2 ) |
78 | 106k | { |
79 | 106k | TCoeff* dstPtr = dst; |
80 | | |
81 | 106k | const TCoeff* srcPtr0 = &src[ k * lines]; |
82 | 106k | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; |
83 | | |
84 | 106k | __m256i vsrc1v[trLoops][2]; |
85 | | |
86 | | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; |
87 | | const TMatrixCoeff* itPtr1 = &it[(k + 1) * trSize]; |
88 | | |
89 | 307k | for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 ) |
90 | 201k | { |
91 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 |
92 | | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); |
93 | | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); |
94 | | #else |
95 | 201k | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); |
96 | 201k | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); |
97 | 201k | #endif |
98 | | |
99 | | vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 ); |
100 | | vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 ); |
101 | | } |
102 | | |
103 | 523k | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) |
104 | 417k | { |
105 | 417k | __m128i xscale = maxLoopL == 4 |
106 | 417k | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) |
107 | 417k | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); |
108 | 417k | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); |
109 | | |
110 | 417k | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } |
111 | | |
112 | 1.74M | for( int l = 0; l < maxLoopL; l++ ) |
113 | 1.39M | { |
114 | 1.39M | __m256i |
115 | 1.39M | vscale = _mm256_broadcastd_epi32( xscale ); |
116 | 1.39M | xscale = _mm_bsrli_si128( xscale, 4 ); |
117 | | |
118 | 4.47M | for( int col = 0; col < trLoops; col++, dstPtr += 16 ) |
119 | 3.08M | { |
120 | 3.08M | __m256i vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); |
121 | | |
122 | 3.08M | __m256i |
123 | 3.08M | vsrc1 = vsrc1v[col][0]; |
124 | 3.08M | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); |
125 | 3.08M | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); |
126 | | |
127 | 3.08M | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); |
128 | | |
129 | 3.08M | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) &dstPtr[8] ); |
130 | | |
131 | 3.08M | vsrc1 = vsrc1v[col][1]; |
132 | 3.08M | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); |
133 | 3.08M | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); |
134 | | |
135 | 3.08M | _mm256_store_si256 ( ( __m256i * ) &dstPtr[8], vsrc0 ); |
136 | 3.08M | } |
137 | 1.39M | } |
138 | 349k | } |
139 | 106k | } |
140 | 29.9k | } |
141 | 6.82k | else |
142 | 6.82k | { |
143 | 26.8k | for( int k = 0; k < rows; k += 2 ) |
144 | 19.9k | { |
145 | 19.9k | TCoeff* dstPtr = dst; |
146 | | |
147 | 19.9k | const TCoeff* srcPtr0 = &src[ k * lines]; |
148 | 19.9k | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; |
149 | | |
150 | 19.9k | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; |
151 | 19.9k | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; |
152 | | |
153 | 19.9k | __m256i vit; |
154 | | |
155 | 19.9k | { |
156 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 |
157 | | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); |
158 | | #else |
159 | 19.9k | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); |
160 | 19.9k | #endif |
161 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 |
162 | | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); |
163 | | #else |
164 | 19.9k | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); |
165 | 19.9k | #endif |
166 | | |
167 | | vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 ); |
168 | | } |
169 | | |
170 | 67.9k | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) |
171 | 48.0k | { |
172 | 48.0k | __m128i xscale = maxLoopL == 4 |
173 | 48.0k | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) |
174 | 48.0k | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); |
175 | 48.0k | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); |
176 | | |
177 | 48.0k | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } |
178 | | |
179 | 197k | for( int l = 0; l < maxLoopL; l++ ) |
180 | 157k | { |
181 | 157k | __m256i |
182 | 157k | vscale = _mm256_broadcastd_epi32( xscale ); |
183 | 157k | xscale = _mm_bsrli_si128( xscale, 4 ); |
184 | | |
185 | 315k | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) |
186 | 157k | { |
187 | 157k | __m256i |
188 | 157k | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); |
189 | 157k | __m256i |
190 | 157k | vsrc1 = _mm256_madd_epi16 ( vit, vscale ); |
191 | 157k | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); |
192 | | |
193 | 157k | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); |
194 | 157k | } |
195 | 157k | } |
196 | 39.6k | } |
197 | 19.9k | } |
198 | 6.82k | } |
199 | 36.7k | } |
200 | | #else |
201 | 0 | if( trSize >= 8 ) |
202 | 0 | { |
203 | 0 | for( int k = 0; k < rows; k += 2 ) |
204 | 0 | { |
205 | 0 | TCoeff* dstPtr = dst; |
206 | | |
207 | | const TCoeff* srcPtr0 = &src[ k * lines]; |
208 | | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; |
209 | | |
210 | 0 | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) |
211 | 0 | { |
212 | 0 | __m128i xscale = maxLoopL == 4 |
213 | 0 | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) |
214 | 0 | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); |
215 | 0 | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); |
216 | |
|
217 | 0 | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } |
218 | | |
219 | 0 | for( int l = 0; l < maxLoopL; l++ ) |
220 | 0 | { |
221 | 0 | const TMatrixCoeff* itPtr0 = &it[k * trSize]; |
222 | 0 | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; |
223 | |
|
224 | 0 | __m128i |
225 | 0 | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); |
226 | 0 | xscale = _mm_bsrli_si128( xscale, 4 ); |
227 | |
|
228 | 0 | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) |
229 | 0 | { |
230 | 0 | __m128i vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); |
231 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 |
232 | | __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 ); |
233 | | __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 ); |
234 | | #else |
235 | 0 | __m128i vit16_0 = _mm_stream_load_si128( ( __m128i * ) itPtr0 ); |
236 | 0 | __m128i vit16_1 = _mm_stream_load_si128( ( __m128i * ) itPtr1 ); |
237 | 0 | #endif |
238 | | |
239 | | __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 ); |
240 | | |
241 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); |
242 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); |
243 | | |
244 | | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); |
245 | | |
246 | | vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] ); |
247 | | |
248 | | vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 ); |
249 | | |
250 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); |
251 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); |
252 | | |
253 | 0 | _mm_store_si128 ( ( __m128i * ) &dstPtr[4], vsrc0 ); |
254 | 0 | } |
255 | 0 | } |
256 | 0 | } |
257 | 0 | } |
258 | 0 | } |
259 | 0 | #endif |
260 | 690 | else if( trSize >= 4 ) |
261 | 690 | { |
262 | 690 | CHECKD( trSize != 4, "trSize needs to be '4'!" ); |
263 | | |
264 | 1.88k | for( int k = 0; k < rows; k += 2 ) |
265 | 1.19k | { |
266 | 1.19k | TCoeff* dstPtr = dst; |
267 | | |
268 | 1.19k | const TCoeff* srcPtr0 = &src[ k * lines]; |
269 | 1.19k | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; |
270 | | |
271 | 1.19k | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; |
272 | 1.19k | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; |
273 | | |
274 | 1.19k | __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) ); |
275 | | |
276 | 4.83k | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) |
277 | 3.63k | { |
278 | 3.63k | __m128i xscale = maxLoopL == 4 |
279 | 3.63k | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) |
280 | 3.63k | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); |
281 | 3.63k | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); |
282 | | |
283 | 3.63k | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } |
284 | | |
285 | 14.3k | for( int l = 0; l < maxLoopL; l++ ) |
286 | 11.4k | { |
287 | 11.4k | __m128i |
288 | 11.4k | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); |
289 | 11.4k | xscale = _mm_bsrli_si128( xscale, 4 ); |
290 | | |
291 | 22.8k | for( int col = 0; col < trSize; col += 4, dstPtr += 4 ) |
292 | 11.4k | { |
293 | 11.4k | __m128i |
294 | 11.4k | vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); |
295 | 11.4k | __m128i |
296 | 11.4k | vsrc1 = _mm_madd_epi16 ( vit, vscale ); |
297 | 11.4k | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); |
298 | | |
299 | 11.4k | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); |
300 | 11.4k | } |
301 | 11.4k | } |
302 | 2.93k | } |
303 | 1.19k | } |
304 | 690 | } |
305 | 0 | else |
306 | 0 | { |
307 | 0 | THROW_FATAL( "Unsupported size" ); |
308 | 0 | } |
309 | | #if USE_AVX2 |
310 | | |
311 | 37.4k | _mm256_zeroupper(); |
312 | 37.4k | #endif |
313 | 37.4k | } Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 16>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 32>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 64>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Line | Count | Source | 67 | 690 | { | 68 | 690 | unsigned maxLoopL = std::min<int>( reducedLines, 4 ); | 69 | | | 70 | 690 | #if USE_AVX2 | 71 | 690 | if( trSize >= 8 && vext >= AVX2 ) | 72 | 0 | { | 73 | 0 | if( ( trSize & 15 ) == 0 ) | 74 | 0 | { | 75 | 0 | static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1; | 76 | |
| 77 | 0 | for( int k = 0; k < rows; k += 2 ) | 78 | 0 | { | 79 | 0 | TCoeff* dstPtr = dst; | 80 | |
| 81 | 0 | const TCoeff* srcPtr0 = &src[ k * lines]; | 82 | 0 | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 83 | |
| 84 | 0 | __m256i vsrc1v[trLoops][2]; | 85 | | | 86 | 0 | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 87 | 0 | const TMatrixCoeff* itPtr1 = &it[(k + 1) * trSize]; | 88 | |
| 89 | 0 | for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 ) | 90 | 0 | { | 91 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 92 | | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 93 | | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 94 | | #else | 95 | 0 | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 96 | 0 | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 97 | 0 | #endif | 98 | |
| 99 | 0 | vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 ); | 100 | 0 | vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 ); | 101 | 0 | } | 102 | |
| 103 | 0 | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 104 | 0 | { | 105 | 0 | __m128i xscale = maxLoopL == 4 | 106 | 0 | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 107 | 0 | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 108 | 0 | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 109 | |
| 110 | 0 | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 111 | | | 112 | 0 | for( int l = 0; l < maxLoopL; l++ ) | 113 | 0 | { | 114 | 0 | __m256i | 115 | 0 | vscale = _mm256_broadcastd_epi32( xscale ); | 116 | 0 | xscale = _mm_bsrli_si128( xscale, 4 ); | 117 | |
| 118 | 0 | for( int col = 0; col < trLoops; col++, dstPtr += 16 ) | 119 | 0 | { | 120 | 0 | __m256i vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); | 121 | |
| 122 | 0 | __m256i | 123 | 0 | vsrc1 = vsrc1v[col][0]; | 124 | 0 | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); | 125 | 0 | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 126 | |
| 127 | 0 | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); | 128 | | | 129 | 0 | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) &dstPtr[8] ); | 130 | |
| 131 | 0 | vsrc1 = vsrc1v[col][1]; | 132 | 0 | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); | 133 | 0 | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 134 | |
| 135 | 0 | _mm256_store_si256 ( ( __m256i * ) &dstPtr[8], vsrc0 ); | 136 | 0 | } | 137 | 0 | } | 138 | 0 | } | 139 | 0 | } | 140 | 0 | } | 141 | 0 | else | 142 | 0 | { | 143 | 0 | for( int k = 0; k < rows; k += 2 ) | 144 | 0 | { | 145 | 0 | TCoeff* dstPtr = dst; | 146 | |
| 147 | 0 | const TCoeff* srcPtr0 = &src[ k * lines]; | 148 | 0 | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 149 | |
| 150 | 0 | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 151 | 0 | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 152 | |
| 153 | 0 | __m256i vit; | 154 | |
| 155 | 0 | { | 156 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 157 | | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 158 | | #else | 159 | 0 | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 160 | 0 | #endif | 161 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 162 | | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 163 | | #else | 164 | 0 | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 165 | 0 | #endif | 166 | |
| 167 | 0 | vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 ); | 168 | 0 | } | 169 | | | 170 | 0 | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 171 | 0 | { | 172 | 0 | __m128i xscale = maxLoopL == 4 | 173 | 0 | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 174 | 0 | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 175 | 0 | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 176 | |
| 177 | 0 | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 178 | | | 179 | 0 | for( int l = 0; l < maxLoopL; l++ ) | 180 | 0 | { | 181 | 0 | __m256i | 182 | 0 | vscale = _mm256_broadcastd_epi32( xscale ); | 183 | 0 | xscale = _mm_bsrli_si128( xscale, 4 ); | 184 | |
| 185 | 0 | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) | 186 | 0 | { | 187 | 0 | __m256i | 188 | 0 | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); | 189 | 0 | __m256i | 190 | 0 | vsrc1 = _mm256_madd_epi16 ( vit, vscale ); | 191 | 0 | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 192 | |
| 193 | 0 | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); | 194 | 0 | } | 195 | 0 | } | 196 | 0 | } | 197 | 0 | } | 198 | 0 | } | 199 | 0 | } | 200 | | #else | 201 | | if( trSize >= 8 ) | 202 | | { | 203 | | for( int k = 0; k < rows; k += 2 ) | 204 | | { | 205 | | TCoeff* dstPtr = dst; | 206 | | | 207 | | const TCoeff* srcPtr0 = &src[ k * lines]; | 208 | | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 209 | | | 210 | | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 211 | | { | 212 | | __m128i xscale = maxLoopL == 4 | 213 | | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 214 | | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 215 | | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 216 | | | 217 | | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 218 | | | 219 | | for( int l = 0; l < maxLoopL; l++ ) | 220 | | { | 221 | | const TMatrixCoeff* itPtr0 = &it[k * trSize]; | 222 | | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 223 | | | 224 | | __m128i | 225 | | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); | 226 | | xscale = _mm_bsrli_si128( xscale, 4 ); | 227 | | | 228 | | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) | 229 | | { | 230 | | __m128i vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); | 231 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 232 | | __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 ); | 233 | | __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 ); | 234 | | #else | 235 | | __m128i vit16_0 = _mm_stream_load_si128( ( __m128i * ) itPtr0 ); | 236 | | __m128i vit16_1 = _mm_stream_load_si128( ( __m128i * ) itPtr1 ); | 237 | | #endif | 238 | | | 239 | | __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 ); | 240 | | | 241 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); | 242 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 243 | | | 244 | | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); | 245 | | | 246 | | vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] ); | 247 | | | 248 | | vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 ); | 249 | | | 250 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); | 251 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 252 | | | 253 | | _mm_store_si128 ( ( __m128i * ) &dstPtr[4], vsrc0 ); | 254 | | } | 255 | | } | 256 | | } | 257 | | } | 258 | | } | 259 | | #endif | 260 | 690 | else if( trSize >= 4 ) | 261 | 690 | { | 262 | 690 | CHECKD( trSize != 4, "trSize needs to be '4'!" ); | 263 | | | 264 | 1.88k | for( int k = 0; k < rows; k += 2 ) | 265 | 1.19k | { | 266 | 1.19k | TCoeff* dstPtr = dst; | 267 | | | 268 | 1.19k | const TCoeff* srcPtr0 = &src[ k * lines]; | 269 | 1.19k | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 270 | | | 271 | 1.19k | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 272 | 1.19k | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 273 | | | 274 | 1.19k | __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) ); | 275 | | | 276 | 4.83k | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 277 | 3.63k | { | 278 | 3.63k | __m128i xscale = maxLoopL == 4 | 279 | 3.63k | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 280 | 3.63k | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 281 | 3.63k | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 282 | | | 283 | 3.63k | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 284 | | | 285 | 14.3k | for( int l = 0; l < maxLoopL; l++ ) | 286 | 11.4k | { | 287 | 11.4k | __m128i | 288 | 11.4k | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); | 289 | 11.4k | xscale = _mm_bsrli_si128( xscale, 4 ); | 290 | | | 291 | 22.8k | for( int col = 0; col < trSize; col += 4, dstPtr += 4 ) | 292 | 11.4k | { | 293 | 11.4k | __m128i | 294 | 11.4k | vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); | 295 | 11.4k | __m128i | 296 | 11.4k | vsrc1 = _mm_madd_epi16 ( vit, vscale ); | 297 | 11.4k | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 298 | | | 299 | 11.4k | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); | 300 | 11.4k | } | 301 | 11.4k | } | 302 | 2.93k | } | 303 | 1.19k | } | 304 | 690 | } | 305 | 0 | else | 306 | 0 | { | 307 | 0 | THROW_FATAL( "Unsupported size" ); | 308 | 0 | } | 309 | 690 | #if USE_AVX2 | 310 | | | 311 | 690 | _mm256_zeroupper(); | 312 | 690 | #endif | 313 | 690 | } |
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Line | Count | Source | 67 | 6.82k | { | 68 | 6.82k | unsigned maxLoopL = std::min<int>( reducedLines, 4 ); | 69 | | | 70 | 6.82k | #if USE_AVX2 | 71 | 6.82k | if( trSize >= 8 && vext >= AVX2 ) | 72 | 6.82k | { | 73 | 6.82k | if( ( trSize & 15 ) == 0 ) | 74 | 0 | { | 75 | 0 | static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1; | 76 | |
| 77 | 0 | for( int k = 0; k < rows; k += 2 ) | 78 | 0 | { | 79 | 0 | TCoeff* dstPtr = dst; | 80 | |
| 81 | 0 | const TCoeff* srcPtr0 = &src[ k * lines]; | 82 | 0 | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 83 | |
| 84 | 0 | __m256i vsrc1v[trLoops][2]; | 85 | | | 86 | 0 | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 87 | 0 | const TMatrixCoeff* itPtr1 = &it[(k + 1) * trSize]; | 88 | |
| 89 | 0 | for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 ) | 90 | 0 | { | 91 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 92 | | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 93 | | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 94 | | #else | 95 | 0 | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 96 | 0 | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 97 | 0 | #endif | 98 | |
| 99 | 0 | vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 ); | 100 | 0 | vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 ); | 101 | 0 | } | 102 | |
| 103 | 0 | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 104 | 0 | { | 105 | 0 | __m128i xscale = maxLoopL == 4 | 106 | 0 | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 107 | 0 | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 108 | 0 | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 109 | |
| 110 | 0 | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 111 | | | 112 | 0 | for( int l = 0; l < maxLoopL; l++ ) | 113 | 0 | { | 114 | 0 | __m256i | 115 | 0 | vscale = _mm256_broadcastd_epi32( xscale ); | 116 | 0 | xscale = _mm_bsrli_si128( xscale, 4 ); | 117 | |
| 118 | 0 | for( int col = 0; col < trLoops; col++, dstPtr += 16 ) | 119 | 0 | { | 120 | 0 | __m256i vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); | 121 | |
| 122 | 0 | __m256i | 123 | 0 | vsrc1 = vsrc1v[col][0]; | 124 | 0 | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); | 125 | 0 | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 126 | |
| 127 | 0 | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); | 128 | | | 129 | 0 | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) &dstPtr[8] ); | 130 | |
| 131 | 0 | vsrc1 = vsrc1v[col][1]; | 132 | 0 | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); | 133 | 0 | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 134 | |
| 135 | 0 | _mm256_store_si256 ( ( __m256i * ) &dstPtr[8], vsrc0 ); | 136 | 0 | } | 137 | 0 | } | 138 | 0 | } | 139 | 0 | } | 140 | 0 | } | 141 | 6.82k | else | 142 | 6.82k | { | 143 | 26.8k | for( int k = 0; k < rows; k += 2 ) | 144 | 19.9k | { | 145 | 19.9k | TCoeff* dstPtr = dst; | 146 | | | 147 | 19.9k | const TCoeff* srcPtr0 = &src[ k * lines]; | 148 | 19.9k | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 149 | | | 150 | 19.9k | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 151 | 19.9k | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 152 | | | 153 | 19.9k | __m256i vit; | 154 | | | 155 | 19.9k | { | 156 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 157 | | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 158 | | #else | 159 | 19.9k | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 160 | 19.9k | #endif | 161 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 162 | | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 163 | | #else | 164 | 19.9k | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 165 | 19.9k | #endif | 166 | | | 167 | 19.9k | vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 ); | 168 | 19.9k | } | 169 | | | 170 | 67.9k | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 171 | 48.0k | { | 172 | 48.0k | __m128i xscale = maxLoopL == 4 | 173 | 48.0k | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 174 | 48.0k | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 175 | 48.0k | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 176 | | | 177 | 48.0k | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 178 | | | 179 | 197k | for( int l = 0; l < maxLoopL; l++ ) | 180 | 157k | { | 181 | 157k | __m256i | 182 | 157k | vscale = _mm256_broadcastd_epi32( xscale ); | 183 | 157k | xscale = _mm_bsrli_si128( xscale, 4 ); | 184 | | | 185 | 315k | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) | 186 | 157k | { | 187 | 157k | __m256i | 188 | 157k | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); | 189 | 157k | __m256i | 190 | 157k | vsrc1 = _mm256_madd_epi16 ( vit, vscale ); | 191 | 157k | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 192 | | | 193 | 157k | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); | 194 | 157k | } | 195 | 157k | } | 196 | 39.6k | } | 197 | 19.9k | } | 198 | 6.82k | } | 199 | 6.82k | } | 200 | | #else | 201 | | if( trSize >= 8 ) | 202 | | { | 203 | | for( int k = 0; k < rows; k += 2 ) | 204 | | { | 205 | | TCoeff* dstPtr = dst; | 206 | | | 207 | | const TCoeff* srcPtr0 = &src[ k * lines]; | 208 | | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 209 | | | 210 | | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 211 | | { | 212 | | __m128i xscale = maxLoopL == 4 | 213 | | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 214 | | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 215 | | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 216 | | | 217 | | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 218 | | | 219 | | for( int l = 0; l < maxLoopL; l++ ) | 220 | | { | 221 | | const TMatrixCoeff* itPtr0 = &it[k * trSize]; | 222 | | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 223 | | | 224 | | __m128i | 225 | | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); | 226 | | xscale = _mm_bsrli_si128( xscale, 4 ); | 227 | | | 228 | | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) | 229 | | { | 230 | | __m128i vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); | 231 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 232 | | __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 ); | 233 | | __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 ); | 234 | | #else | 235 | | __m128i vit16_0 = _mm_stream_load_si128( ( __m128i * ) itPtr0 ); | 236 | | __m128i vit16_1 = _mm_stream_load_si128( ( __m128i * ) itPtr1 ); | 237 | | #endif | 238 | | | 239 | | __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 ); | 240 | | | 241 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); | 242 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 243 | | | 244 | | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); | 245 | | | 246 | | vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] ); | 247 | | | 248 | | vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 ); | 249 | | | 250 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); | 251 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 252 | | | 253 | | _mm_store_si128 ( ( __m128i * ) &dstPtr[4], vsrc0 ); | 254 | | } | 255 | | } | 256 | | } | 257 | | } | 258 | | } | 259 | | #endif | 260 | 0 | else if( trSize >= 4 ) | 261 | 0 | { | 262 | 0 | CHECKD( trSize != 4, "trSize needs to be '4'!" ); | 263 | |
| 264 | 0 | for( int k = 0; k < rows; k += 2 ) | 265 | 0 | { | 266 | 0 | TCoeff* dstPtr = dst; | 267 | |
| 268 | 0 | const TCoeff* srcPtr0 = &src[ k * lines]; | 269 | 0 | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 270 | |
| 271 | 0 | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 272 | 0 | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 273 | |
| 274 | 0 | __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) ); | 275 | | | 276 | 0 | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 277 | 0 | { | 278 | 0 | __m128i xscale = maxLoopL == 4 | 279 | 0 | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 280 | 0 | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 281 | 0 | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 282 | |
| 283 | 0 | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 284 | | | 285 | 0 | for( int l = 0; l < maxLoopL; l++ ) | 286 | 0 | { | 287 | 0 | __m128i | 288 | 0 | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); | 289 | 0 | xscale = _mm_bsrli_si128( xscale, 4 ); | 290 | |
| 291 | 0 | for( int col = 0; col < trSize; col += 4, dstPtr += 4 ) | 292 | 0 | { | 293 | 0 | __m128i | 294 | 0 | vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); | 295 | 0 | __m128i | 296 | 0 | vsrc1 = _mm_madd_epi16 ( vit, vscale ); | 297 | 0 | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 298 | |
| 299 | 0 | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); | 300 | 0 | } | 301 | 0 | } | 302 | 0 | } | 303 | 0 | } | 304 | 0 | } | 305 | 0 | else | 306 | 0 | { | 307 | 0 | THROW_FATAL( "Unsupported size" ); | 308 | 0 | } | 309 | 6.82k | #if USE_AVX2 | 310 | | | 311 | 6.82k | _mm256_zeroupper(); | 312 | 6.82k | #endif | 313 | 6.82k | } |
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 16>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Line | Count | Source | 67 | 9.32k | { | 68 | 9.32k | unsigned maxLoopL = std::min<int>( reducedLines, 4 ); | 69 | | | 70 | 9.32k | #if USE_AVX2 | 71 | 9.32k | if( trSize >= 8 && vext >= AVX2 ) | 72 | 9.32k | { | 73 | 9.32k | if( ( trSize & 15 ) == 0 ) | 74 | 9.32k | { | 75 | 9.32k | static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1; | 76 | | | 77 | 38.7k | for( int k = 0; k < rows; k += 2 ) | 78 | 29.4k | { | 79 | 29.4k | TCoeff* dstPtr = dst; | 80 | | | 81 | 29.4k | const TCoeff* srcPtr0 = &src[ k * lines]; | 82 | 29.4k | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 83 | | | 84 | 29.4k | __m256i vsrc1v[trLoops][2]; | 85 | | | 86 | 29.4k | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 87 | 29.4k | const TMatrixCoeff* itPtr1 = &it[(k + 1) * trSize]; | 88 | | | 89 | 58.8k | for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 ) | 90 | 29.4k | { | 91 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 92 | | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 93 | | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 94 | | #else | 95 | 29.4k | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 96 | 29.4k | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 97 | 29.4k | #endif | 98 | | | 99 | 29.4k | vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 ); | 100 | 29.4k | vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 ); | 101 | 29.4k | } | 102 | | | 103 | 113k | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 104 | 83.7k | { | 105 | 83.7k | __m128i xscale = maxLoopL == 4 | 106 | 83.7k | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 107 | 83.7k | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 108 | 83.7k | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 109 | | | 110 | 83.7k | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 111 | | | 112 | 342k | for( int l = 0; l < maxLoopL; l++ ) | 113 | 273k | { | 114 | 273k | __m256i | 115 | 273k | vscale = _mm256_broadcastd_epi32( xscale ); | 116 | 273k | xscale = _mm_bsrli_si128( xscale, 4 ); | 117 | | | 118 | 547k | for( int col = 0; col < trLoops; col++, dstPtr += 16 ) | 119 | 273k | { | 120 | 273k | __m256i vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); | 121 | | | 122 | 273k | __m256i | 123 | 273k | vsrc1 = vsrc1v[col][0]; | 124 | 273k | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); | 125 | 273k | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 126 | | | 127 | 273k | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); | 128 | | | 129 | 273k | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) &dstPtr[8] ); | 130 | | | 131 | 273k | vsrc1 = vsrc1v[col][1]; | 132 | 273k | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); | 133 | 273k | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 134 | | | 135 | 273k | _mm256_store_si256 ( ( __m256i * ) &dstPtr[8], vsrc0 ); | 136 | 273k | } | 137 | 273k | } | 138 | 68.7k | } | 139 | 29.4k | } | 140 | 9.32k | } | 141 | 0 | else | 142 | 0 | { | 143 | 0 | for( int k = 0; k < rows; k += 2 ) | 144 | 0 | { | 145 | 0 | TCoeff* dstPtr = dst; | 146 | |
| 147 | 0 | const TCoeff* srcPtr0 = &src[ k * lines]; | 148 | 0 | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 149 | |
| 150 | 0 | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 151 | 0 | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 152 | |
| 153 | 0 | __m256i vit; | 154 | |
| 155 | 0 | { | 156 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 157 | | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 158 | | #else | 159 | 0 | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 160 | 0 | #endif | 161 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 162 | | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 163 | | #else | 164 | 0 | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 165 | 0 | #endif | 166 | |
| 167 | 0 | vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 ); | 168 | 0 | } | 169 | | | 170 | 0 | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 171 | 0 | { | 172 | 0 | __m128i xscale = maxLoopL == 4 | 173 | 0 | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 174 | 0 | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 175 | 0 | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 176 | |
| 177 | 0 | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 178 | | | 179 | 0 | for( int l = 0; l < maxLoopL; l++ ) | 180 | 0 | { | 181 | 0 | __m256i | 182 | 0 | vscale = _mm256_broadcastd_epi32( xscale ); | 183 | 0 | xscale = _mm_bsrli_si128( xscale, 4 ); | 184 | |
| 185 | 0 | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) | 186 | 0 | { | 187 | 0 | __m256i | 188 | 0 | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); | 189 | 0 | __m256i | 190 | 0 | vsrc1 = _mm256_madd_epi16 ( vit, vscale ); | 191 | 0 | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 192 | |
| 193 | 0 | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); | 194 | 0 | } | 195 | 0 | } | 196 | 0 | } | 197 | 0 | } | 198 | 0 | } | 199 | 9.32k | } | 200 | | #else | 201 | | if( trSize >= 8 ) | 202 | | { | 203 | | for( int k = 0; k < rows; k += 2 ) | 204 | | { | 205 | | TCoeff* dstPtr = dst; | 206 | | | 207 | | const TCoeff* srcPtr0 = &src[ k * lines]; | 208 | | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 209 | | | 210 | | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 211 | | { | 212 | | __m128i xscale = maxLoopL == 4 | 213 | | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 214 | | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 215 | | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 216 | | | 217 | | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 218 | | | 219 | | for( int l = 0; l < maxLoopL; l++ ) | 220 | | { | 221 | | const TMatrixCoeff* itPtr0 = &it[k * trSize]; | 222 | | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 223 | | | 224 | | __m128i | 225 | | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); | 226 | | xscale = _mm_bsrli_si128( xscale, 4 ); | 227 | | | 228 | | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) | 229 | | { | 230 | | __m128i vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); | 231 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 232 | | __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 ); | 233 | | __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 ); | 234 | | #else | 235 | | __m128i vit16_0 = _mm_stream_load_si128( ( __m128i * ) itPtr0 ); | 236 | | __m128i vit16_1 = _mm_stream_load_si128( ( __m128i * ) itPtr1 ); | 237 | | #endif | 238 | | | 239 | | __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 ); | 240 | | | 241 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); | 242 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 243 | | | 244 | | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); | 245 | | | 246 | | vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] ); | 247 | | | 248 | | vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 ); | 249 | | | 250 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); | 251 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 252 | | | 253 | | _mm_store_si128 ( ( __m128i * ) &dstPtr[4], vsrc0 ); | 254 | | } | 255 | | } | 256 | | } | 257 | | } | 258 | | } | 259 | | #endif | 260 | 0 | else if( trSize >= 4 ) | 261 | 0 | { | 262 | 0 | CHECKD( trSize != 4, "trSize needs to be '4'!" ); | 263 | |
| 264 | 0 | for( int k = 0; k < rows; k += 2 ) | 265 | 0 | { | 266 | 0 | TCoeff* dstPtr = dst; | 267 | |
| 268 | 0 | const TCoeff* srcPtr0 = &src[ k * lines]; | 269 | 0 | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 270 | |
| 271 | 0 | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 272 | 0 | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 273 | |
| 274 | 0 | __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) ); | 275 | | | 276 | 0 | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 277 | 0 | { | 278 | 0 | __m128i xscale = maxLoopL == 4 | 279 | 0 | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 280 | 0 | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 281 | 0 | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 282 | |
| 283 | 0 | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 284 | | | 285 | 0 | for( int l = 0; l < maxLoopL; l++ ) | 286 | 0 | { | 287 | 0 | __m128i | 288 | 0 | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); | 289 | 0 | xscale = _mm_bsrli_si128( xscale, 4 ); | 290 | |
| 291 | 0 | for( int col = 0; col < trSize; col += 4, dstPtr += 4 ) | 292 | 0 | { | 293 | 0 | __m128i | 294 | 0 | vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); | 295 | 0 | __m128i | 296 | 0 | vsrc1 = _mm_madd_epi16 ( vit, vscale ); | 297 | 0 | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 298 | |
| 299 | 0 | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); | 300 | 0 | } | 301 | 0 | } | 302 | 0 | } | 303 | 0 | } | 304 | 0 | } | 305 | 0 | else | 306 | 0 | { | 307 | 0 | THROW_FATAL( "Unsupported size" ); | 308 | 0 | } | 309 | 9.32k | #if USE_AVX2 | 310 | | | 311 | 9.32k | _mm256_zeroupper(); | 312 | 9.32k | #endif | 313 | 9.32k | } |
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 32>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Line | Count | Source | 67 | 18.0k | { | 68 | 18.0k | unsigned maxLoopL = std::min<int>( reducedLines, 4 ); | 69 | | | 70 | 18.0k | #if USE_AVX2 | 71 | 18.0k | if( trSize >= 8 && vext >= AVX2 ) | 72 | 18.0k | { | 73 | 18.0k | if( ( trSize & 15 ) == 0 ) | 74 | 18.0k | { | 75 | 18.0k | static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1; | 76 | | | 77 | 85.6k | for( int k = 0; k < rows; k += 2 ) | 78 | 67.6k | { | 79 | 67.6k | TCoeff* dstPtr = dst; | 80 | | | 81 | 67.6k | const TCoeff* srcPtr0 = &src[ k * lines]; | 82 | 67.6k | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 83 | | | 84 | 67.6k | __m256i vsrc1v[trLoops][2]; | 85 | | | 86 | 67.6k | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 87 | 67.6k | const TMatrixCoeff* itPtr1 = &it[(k + 1) * trSize]; | 88 | | | 89 | 202k | for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 ) | 90 | 135k | { | 91 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 92 | | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 93 | | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 94 | | #else | 95 | 135k | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 96 | 135k | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 97 | 135k | #endif | 98 | | | 99 | 135k | vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 ); | 100 | 135k | vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 ); | 101 | 135k | } | 102 | | | 103 | 318k | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 104 | 250k | { | 105 | 250k | __m128i xscale = maxLoopL == 4 | 106 | 250k | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 107 | 250k | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 108 | 250k | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 109 | | | 110 | 250k | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 111 | | | 112 | 1.04M | for( int l = 0; l < maxLoopL; l++ ) | 113 | 834k | { | 114 | 834k | __m256i | 115 | 834k | vscale = _mm256_broadcastd_epi32( xscale ); | 116 | 834k | xscale = _mm_bsrli_si128( xscale, 4 ); | 117 | | | 118 | 2.50M | for( int col = 0; col < trLoops; col++, dstPtr += 16 ) | 119 | 1.66M | { | 120 | 1.66M | __m256i vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); | 121 | | | 122 | 1.66M | __m256i | 123 | 1.66M | vsrc1 = vsrc1v[col][0]; | 124 | 1.66M | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); | 125 | 1.66M | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 126 | | | 127 | 1.66M | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); | 128 | | | 129 | 1.66M | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) &dstPtr[8] ); | 130 | | | 131 | 1.66M | vsrc1 = vsrc1v[col][1]; | 132 | 1.66M | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); | 133 | 1.66M | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 134 | | | 135 | 1.66M | _mm256_store_si256 ( ( __m256i * ) &dstPtr[8], vsrc0 ); | 136 | 1.66M | } | 137 | 834k | } | 138 | 209k | } | 139 | 67.6k | } | 140 | 18.0k | } | 141 | 0 | else | 142 | 0 | { | 143 | 0 | for( int k = 0; k < rows; k += 2 ) | 144 | 0 | { | 145 | 0 | TCoeff* dstPtr = dst; | 146 | |
| 147 | 0 | const TCoeff* srcPtr0 = &src[ k * lines]; | 148 | 0 | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 149 | |
| 150 | 0 | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 151 | 0 | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 152 | |
| 153 | 0 | __m256i vit; | 154 | |
| 155 | 0 | { | 156 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 157 | | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 158 | | #else | 159 | 0 | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 160 | 0 | #endif | 161 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 162 | | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 163 | | #else | 164 | 0 | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 165 | 0 | #endif | 166 | |
| 167 | 0 | vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 ); | 168 | 0 | } | 169 | | | 170 | 0 | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 171 | 0 | { | 172 | 0 | __m128i xscale = maxLoopL == 4 | 173 | 0 | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 174 | 0 | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 175 | 0 | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 176 | |
| 177 | 0 | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 178 | | | 179 | 0 | for( int l = 0; l < maxLoopL; l++ ) | 180 | 0 | { | 181 | 0 | __m256i | 182 | 0 | vscale = _mm256_broadcastd_epi32( xscale ); | 183 | 0 | xscale = _mm_bsrli_si128( xscale, 4 ); | 184 | |
| 185 | 0 | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) | 186 | 0 | { | 187 | 0 | __m256i | 188 | 0 | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); | 189 | 0 | __m256i | 190 | 0 | vsrc1 = _mm256_madd_epi16 ( vit, vscale ); | 191 | 0 | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 192 | |
| 193 | 0 | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); | 194 | 0 | } | 195 | 0 | } | 196 | 0 | } | 197 | 0 | } | 198 | 0 | } | 199 | 18.0k | } | 200 | | #else | 201 | | if( trSize >= 8 ) | 202 | | { | 203 | | for( int k = 0; k < rows; k += 2 ) | 204 | | { | 205 | | TCoeff* dstPtr = dst; | 206 | | | 207 | | const TCoeff* srcPtr0 = &src[ k * lines]; | 208 | | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 209 | | | 210 | | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 211 | | { | 212 | | __m128i xscale = maxLoopL == 4 | 213 | | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 214 | | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 215 | | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 216 | | | 217 | | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 218 | | | 219 | | for( int l = 0; l < maxLoopL; l++ ) | 220 | | { | 221 | | const TMatrixCoeff* itPtr0 = &it[k * trSize]; | 222 | | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 223 | | | 224 | | __m128i | 225 | | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); | 226 | | xscale = _mm_bsrli_si128( xscale, 4 ); | 227 | | | 228 | | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) | 229 | | { | 230 | | __m128i vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); | 231 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 232 | | __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 ); | 233 | | __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 ); | 234 | | #else | 235 | | __m128i vit16_0 = _mm_stream_load_si128( ( __m128i * ) itPtr0 ); | 236 | | __m128i vit16_1 = _mm_stream_load_si128( ( __m128i * ) itPtr1 ); | 237 | | #endif | 238 | | | 239 | | __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 ); | 240 | | | 241 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); | 242 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 243 | | | 244 | | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); | 245 | | | 246 | | vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] ); | 247 | | | 248 | | vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 ); | 249 | | | 250 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); | 251 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 252 | | | 253 | | _mm_store_si128 ( ( __m128i * ) &dstPtr[4], vsrc0 ); | 254 | | } | 255 | | } | 256 | | } | 257 | | } | 258 | | } | 259 | | #endif | 260 | 0 | else if( trSize >= 4 ) | 261 | 0 | { | 262 | 0 | CHECKD( trSize != 4, "trSize needs to be '4'!" ); | 263 | |
| 264 | 0 | for( int k = 0; k < rows; k += 2 ) | 265 | 0 | { | 266 | 0 | TCoeff* dstPtr = dst; | 267 | |
| 268 | 0 | const TCoeff* srcPtr0 = &src[ k * lines]; | 269 | 0 | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 270 | |
| 271 | 0 | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 272 | 0 | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 273 | |
| 274 | 0 | __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) ); | 275 | | | 276 | 0 | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 277 | 0 | { | 278 | 0 | __m128i xscale = maxLoopL == 4 | 279 | 0 | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 280 | 0 | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 281 | 0 | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 282 | |
| 283 | 0 | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 284 | | | 285 | 0 | for( int l = 0; l < maxLoopL; l++ ) | 286 | 0 | { | 287 | 0 | __m128i | 288 | 0 | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); | 289 | 0 | xscale = _mm_bsrli_si128( xscale, 4 ); | 290 | |
| 291 | 0 | for( int col = 0; col < trSize; col += 4, dstPtr += 4 ) | 292 | 0 | { | 293 | 0 | __m128i | 294 | 0 | vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); | 295 | 0 | __m128i | 296 | 0 | vsrc1 = _mm_madd_epi16 ( vit, vscale ); | 297 | 0 | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 298 | |
| 299 | 0 | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); | 300 | 0 | } | 301 | 0 | } | 302 | 0 | } | 303 | 0 | } | 304 | 0 | } | 305 | 0 | else | 306 | 0 | { | 307 | 0 | THROW_FATAL( "Unsupported size" ); | 308 | 0 | } | 309 | 18.0k | #if USE_AVX2 | 310 | | | 311 | 18.0k | _mm256_zeroupper(); | 312 | 18.0k | #endif | 313 | 18.0k | } |
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 64>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Line | Count | Source | 67 | 2.58k | { | 68 | 2.58k | unsigned maxLoopL = std::min<int>( reducedLines, 4 ); | 69 | | | 70 | 2.58k | #if USE_AVX2 | 71 | 2.58k | if( trSize >= 8 && vext >= AVX2 ) | 72 | 2.58k | { | 73 | 2.58k | if( ( trSize & 15 ) == 0 ) | 74 | 2.58k | { | 75 | 2.58k | static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1; | 76 | | | 77 | 11.7k | for( int k = 0; k < rows; k += 2 ) | 78 | 9.17k | { | 79 | 9.17k | TCoeff* dstPtr = dst; | 80 | | | 81 | 9.17k | const TCoeff* srcPtr0 = &src[ k * lines]; | 82 | 9.17k | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 83 | | | 84 | 9.17k | __m256i vsrc1v[trLoops][2]; | 85 | | | 86 | 9.17k | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 87 | 9.17k | const TMatrixCoeff* itPtr1 = &it[(k + 1) * trSize]; | 88 | | | 89 | 45.8k | for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 ) | 90 | 36.6k | { | 91 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 92 | | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 93 | | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 94 | | #else | 95 | 36.6k | __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 96 | 36.6k | __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 97 | 36.6k | #endif | 98 | | | 99 | 36.6k | vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 ); | 100 | 36.6k | vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 ); | 101 | 36.6k | } | 102 | | | 103 | 92.5k | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 104 | 83.4k | { | 105 | 83.4k | __m128i xscale = maxLoopL == 4 | 106 | 83.4k | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 107 | 83.4k | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 108 | 83.4k | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 109 | | | 110 | 83.4k | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 111 | | | 112 | 357k | for( int l = 0; l < maxLoopL; l++ ) | 113 | 285k | { | 114 | 285k | __m256i | 115 | 285k | vscale = _mm256_broadcastd_epi32( xscale ); | 116 | 285k | xscale = _mm_bsrli_si128( xscale, 4 ); | 117 | | | 118 | 1.42M | for( int col = 0; col < trLoops; col++, dstPtr += 16 ) | 119 | 1.14M | { | 120 | 1.14M | __m256i vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); | 121 | | | 122 | 1.14M | __m256i | 123 | 1.14M | vsrc1 = vsrc1v[col][0]; | 124 | 1.14M | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); | 125 | 1.14M | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 126 | | | 127 | 1.14M | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); | 128 | | | 129 | 1.14M | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) &dstPtr[8] ); | 130 | | | 131 | 1.14M | vsrc1 = vsrc1v[col][1]; | 132 | 1.14M | vsrc1 = _mm256_madd_epi16 ( vsrc1, vscale ); | 133 | 1.14M | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 134 | | | 135 | 1.14M | _mm256_store_si256 ( ( __m256i * ) &dstPtr[8], vsrc0 ); | 136 | 1.14M | } | 137 | 285k | } | 138 | 71.5k | } | 139 | 9.17k | } | 140 | 2.58k | } | 141 | 0 | else | 142 | 0 | { | 143 | 0 | for( int k = 0; k < rows; k += 2 ) | 144 | 0 | { | 145 | 0 | TCoeff* dstPtr = dst; | 146 | |
| 147 | 0 | const TCoeff* srcPtr0 = &src[ k * lines]; | 148 | 0 | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 149 | |
| 150 | 0 | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 151 | 0 | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 152 | |
| 153 | 0 | __m256i vit; | 154 | |
| 155 | 0 | { | 156 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 157 | | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 158 | | #else | 159 | 0 | __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 160 | 0 | #endif | 161 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 162 | | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 163 | | #else | 164 | 0 | __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) ); | 165 | 0 | #endif | 166 | |
| 167 | 0 | vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 ); | 168 | 0 | } | 169 | | | 170 | 0 | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 171 | 0 | { | 172 | 0 | __m128i xscale = maxLoopL == 4 | 173 | 0 | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 174 | 0 | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 175 | 0 | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 176 | |
| 177 | 0 | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 178 | | | 179 | 0 | for( int l = 0; l < maxLoopL; l++ ) | 180 | 0 | { | 181 | 0 | __m256i | 182 | 0 | vscale = _mm256_broadcastd_epi32( xscale ); | 183 | 0 | xscale = _mm_bsrli_si128( xscale, 4 ); | 184 | |
| 185 | 0 | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) | 186 | 0 | { | 187 | 0 | __m256i | 188 | 0 | vsrc0 = _mm256_load_si256 ( ( const __m256i * ) dstPtr ); | 189 | 0 | __m256i | 190 | 0 | vsrc1 = _mm256_madd_epi16 ( vit, vscale ); | 191 | 0 | vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 ); | 192 | |
| 193 | 0 | _mm256_store_si256 ( ( __m256i * ) dstPtr, vsrc0 ); | 194 | 0 | } | 195 | 0 | } | 196 | 0 | } | 197 | 0 | } | 198 | 0 | } | 199 | 2.58k | } | 200 | | #else | 201 | | if( trSize >= 8 ) | 202 | | { | 203 | | for( int k = 0; k < rows; k += 2 ) | 204 | | { | 205 | | TCoeff* dstPtr = dst; | 206 | | | 207 | | const TCoeff* srcPtr0 = &src[ k * lines]; | 208 | | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 209 | | | 210 | | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 211 | | { | 212 | | __m128i xscale = maxLoopL == 4 | 213 | | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 214 | | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 215 | | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 216 | | | 217 | | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 218 | | | 219 | | for( int l = 0; l < maxLoopL; l++ ) | 220 | | { | 221 | | const TMatrixCoeff* itPtr0 = &it[k * trSize]; | 222 | | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 223 | | | 224 | | __m128i | 225 | | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); | 226 | | xscale = _mm_bsrli_si128( xscale, 4 ); | 227 | | | 228 | | for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 ) | 229 | | { | 230 | | __m128i vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); | 231 | | #if defined( _MSC_VER ) && _MSC_VER > 1900 | 232 | | __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 ); | 233 | | __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 ); | 234 | | #else | 235 | | __m128i vit16_0 = _mm_stream_load_si128( ( __m128i * ) itPtr0 ); | 236 | | __m128i vit16_1 = _mm_stream_load_si128( ( __m128i * ) itPtr1 ); | 237 | | #endif | 238 | | | 239 | | __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 ); | 240 | | | 241 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); | 242 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 243 | | | 244 | | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); | 245 | | | 246 | | vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] ); | 247 | | | 248 | | vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 ); | 249 | | | 250 | | vsrc1 = _mm_madd_epi16 ( vsrc1, vscale ); | 251 | | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 252 | | | 253 | | _mm_store_si128 ( ( __m128i * ) &dstPtr[4], vsrc0 ); | 254 | | } | 255 | | } | 256 | | } | 257 | | } | 258 | | } | 259 | | #endif | 260 | 0 | else if( trSize >= 4 ) | 261 | 0 | { | 262 | 0 | CHECKD( trSize != 4, "trSize needs to be '4'!" ); | 263 | |
| 264 | 0 | for( int k = 0; k < rows; k += 2 ) | 265 | 0 | { | 266 | 0 | TCoeff* dstPtr = dst; | 267 | |
| 268 | 0 | const TCoeff* srcPtr0 = &src[ k * lines]; | 269 | 0 | const TCoeff* srcPtr1 = &src[(k + 1) * lines]; | 270 | |
| 271 | 0 | const TMatrixCoeff* itPtr0 = &it[ k * trSize]; | 272 | 0 | const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize]; | 273 | |
| 274 | 0 | __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) ); | 275 | | | 276 | 0 | for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL ) | 277 | 0 | { | 278 | 0 | __m128i xscale = maxLoopL == 4 | 279 | 0 | ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) ) | 280 | 0 | : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) ); | 281 | 0 | xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) ); | 282 | |
| 283 | 0 | if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; } | 284 | | | 285 | 0 | for( int l = 0; l < maxLoopL; l++ ) | 286 | 0 | { | 287 | 0 | __m128i | 288 | 0 | vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) ); | 289 | 0 | xscale = _mm_bsrli_si128( xscale, 4 ); | 290 | |
| 291 | 0 | for( int col = 0; col < trSize; col += 4, dstPtr += 4 ) | 292 | 0 | { | 293 | 0 | __m128i | 294 | 0 | vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr ); | 295 | 0 | __m128i | 296 | 0 | vsrc1 = _mm_madd_epi16 ( vit, vscale ); | 297 | 0 | vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 ); | 298 | |
| 299 | 0 | _mm_store_si128 ( ( __m128i * ) dstPtr, vsrc0 ); | 300 | 0 | } | 301 | 0 | } | 302 | 0 | } | 303 | 0 | } | 304 | 0 | } | 305 | 0 | else | 306 | 0 | { | 307 | 0 | THROW_FATAL( "Unsupported size" ); | 308 | 0 | } | 309 | 2.58k | #if USE_AVX2 | 310 | | | 311 | 2.58k | _mm256_zeroupper(); | 312 | 2.58k | #endif | 313 | 2.58k | } |
|
314 | | |
315 | | template< X86_VEXT vext, int W > |
316 | | void roundClip_SSE( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift ) |
317 | 19.9k | { |
318 | | #if USE_AVX2 |
319 | 19.9k | if( W >= 8 && vext >= AVX2 ) |
320 | 18.0k | { |
321 | 18.0k | __m256i vmin = _mm256_set1_epi32( outputMin ); |
322 | 18.0k | __m256i vmax = _mm256_set1_epi32( outputMax ); |
323 | 18.0k | __m256i vrnd = _mm256_set1_epi32( round ); |
324 | | |
325 | 127k | while( height-- ) |
326 | 109k | { |
327 | 468k | for( int col = 0; col < width; col += 8 ) |
328 | 359k | { |
329 | 359k | __m256i |
330 | 359k | vdst = _mm256_load_si256( ( __m256i * ) &dst[col] ); |
331 | 359k | vdst = _mm256_add_epi32 ( vdst, vrnd ); |
332 | 359k | vdst = _mm256_srai_epi32( vdst, shift ); |
333 | 359k | vdst = _mm256_max_epi32 ( vdst, vmin ); |
334 | 359k | vdst = _mm256_min_epi32 ( vdst, vmax ); |
335 | 359k | _mm256_store_si256 ( ( __m256i * ) &dst[col], vdst ); |
336 | 359k | } |
337 | | |
338 | 109k | dst += stride; |
339 | 109k | } |
340 | 18.0k | } |
341 | 1.94k | else |
342 | 1.94k | #endif |
343 | 1.94k | if( W >= 4 ) |
344 | 1.94k | { |
345 | 1.94k | __m128i vmin = _mm_set1_epi32( outputMin ); |
346 | 1.94k | __m128i vmax = _mm_set1_epi32( outputMax ); |
347 | 1.94k | __m128i vrnd = _mm_set1_epi32( round ); |
348 | | |
349 | 13.2k | while( height-- ) |
350 | 11.3k | { |
351 | 22.6k | for( int col = 0; col < width; col += 4 ) |
352 | 11.3k | { |
353 | 11.3k | __m128i |
354 | 11.3k | vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] ); |
355 | 11.3k | vdst = _mm_add_epi32 ( vdst, vrnd ); |
356 | 11.3k | vdst = _mm_srai_epi32 ( vdst, shift ); |
357 | 11.3k | vdst = _mm_max_epi32 ( vdst, vmin ); |
358 | 11.3k | vdst = _mm_min_epi32 ( vdst, vmax ); |
359 | 11.3k | _mm_store_si128 ( ( __m128i * ) &dst[col], vdst ); |
360 | 11.3k | } |
361 | | |
362 | 11.3k | dst += stride; |
363 | 11.3k | } |
364 | 1.94k | } |
365 | 0 | else |
366 | 0 | { |
367 | 0 | THROW_FATAL( "Unsupported size" ); |
368 | 0 | } |
369 | | #if USE_AVX2 |
370 | | |
371 | 19.9k | _mm256_zeroupper(); |
372 | 19.9k | #endif |
373 | 19.9k | } Unexecuted instantiation: void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int) Unexecuted instantiation: void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int) void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int) Line | Count | Source | 317 | 1.94k | { | 318 | 1.94k | #if USE_AVX2 | 319 | 1.94k | if( W >= 8 && vext >= AVX2 ) | 320 | 0 | { | 321 | 0 | __m256i vmin = _mm256_set1_epi32( outputMin ); | 322 | 0 | __m256i vmax = _mm256_set1_epi32( outputMax ); | 323 | 0 | __m256i vrnd = _mm256_set1_epi32( round ); | 324 | |
| 325 | 0 | while( height-- ) | 326 | 0 | { | 327 | 0 | for( int col = 0; col < width; col += 8 ) | 328 | 0 | { | 329 | 0 | __m256i | 330 | 0 | vdst = _mm256_load_si256( ( __m256i * ) &dst[col] ); | 331 | 0 | vdst = _mm256_add_epi32 ( vdst, vrnd ); | 332 | 0 | vdst = _mm256_srai_epi32( vdst, shift ); | 333 | 0 | vdst = _mm256_max_epi32 ( vdst, vmin ); | 334 | 0 | vdst = _mm256_min_epi32 ( vdst, vmax ); | 335 | 0 | _mm256_store_si256 ( ( __m256i * ) &dst[col], vdst ); | 336 | 0 | } | 337 | |
| 338 | 0 | dst += stride; | 339 | 0 | } | 340 | 0 | } | 341 | 1.94k | else | 342 | 1.94k | #endif | 343 | 1.94k | if( W >= 4 ) | 344 | 1.94k | { | 345 | 1.94k | __m128i vmin = _mm_set1_epi32( outputMin ); | 346 | 1.94k | __m128i vmax = _mm_set1_epi32( outputMax ); | 347 | 1.94k | __m128i vrnd = _mm_set1_epi32( round ); | 348 | | | 349 | 13.2k | while( height-- ) | 350 | 11.3k | { | 351 | 22.6k | for( int col = 0; col < width; col += 4 ) | 352 | 11.3k | { | 353 | 11.3k | __m128i | 354 | 11.3k | vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] ); | 355 | 11.3k | vdst = _mm_add_epi32 ( vdst, vrnd ); | 356 | 11.3k | vdst = _mm_srai_epi32 ( vdst, shift ); | 357 | 11.3k | vdst = _mm_max_epi32 ( vdst, vmin ); | 358 | 11.3k | vdst = _mm_min_epi32 ( vdst, vmax ); | 359 | 11.3k | _mm_store_si128 ( ( __m128i * ) &dst[col], vdst ); | 360 | 11.3k | } | 361 | | | 362 | 11.3k | dst += stride; | 363 | 11.3k | } | 364 | 1.94k | } | 365 | 0 | else | 366 | 0 | { | 367 | 0 | THROW_FATAL( "Unsupported size" ); | 368 | 0 | } | 369 | 1.94k | #if USE_AVX2 | 370 | | | 371 | 1.94k | _mm256_zeroupper(); | 372 | 1.94k | #endif | 373 | 1.94k | } |
void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int) Line | Count | Source | 317 | 18.0k | { | 318 | 18.0k | #if USE_AVX2 | 319 | 18.0k | if( W >= 8 && vext >= AVX2 ) | 320 | 18.0k | { | 321 | 18.0k | __m256i vmin = _mm256_set1_epi32( outputMin ); | 322 | 18.0k | __m256i vmax = _mm256_set1_epi32( outputMax ); | 323 | 18.0k | __m256i vrnd = _mm256_set1_epi32( round ); | 324 | | | 325 | 127k | while( height-- ) | 326 | 109k | { | 327 | 468k | for( int col = 0; col < width; col += 8 ) | 328 | 359k | { | 329 | 359k | __m256i | 330 | 359k | vdst = _mm256_load_si256( ( __m256i * ) &dst[col] ); | 331 | 359k | vdst = _mm256_add_epi32 ( vdst, vrnd ); | 332 | 359k | vdst = _mm256_srai_epi32( vdst, shift ); | 333 | 359k | vdst = _mm256_max_epi32 ( vdst, vmin ); | 334 | 359k | vdst = _mm256_min_epi32 ( vdst, vmax ); | 335 | 359k | _mm256_store_si256 ( ( __m256i * ) &dst[col], vdst ); | 336 | 359k | } | 337 | | | 338 | 109k | dst += stride; | 339 | 109k | } | 340 | 18.0k | } | 341 | 0 | else | 342 | 0 | #endif | 343 | 0 | if( W >= 4 ) | 344 | 0 | { | 345 | 0 | __m128i vmin = _mm_set1_epi32( outputMin ); | 346 | 0 | __m128i vmax = _mm_set1_epi32( outputMax ); | 347 | 0 | __m128i vrnd = _mm_set1_epi32( round ); | 348 | |
| 349 | 0 | while( height-- ) | 350 | 0 | { | 351 | 0 | for( int col = 0; col < width; col += 4 ) | 352 | 0 | { | 353 | 0 | __m128i | 354 | 0 | vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] ); | 355 | 0 | vdst = _mm_add_epi32 ( vdst, vrnd ); | 356 | 0 | vdst = _mm_srai_epi32 ( vdst, shift ); | 357 | 0 | vdst = _mm_max_epi32 ( vdst, vmin ); | 358 | 0 | vdst = _mm_min_epi32 ( vdst, vmax ); | 359 | 0 | _mm_store_si128 ( ( __m128i * ) &dst[col], vdst ); | 360 | 0 | } | 361 | |
| 362 | 0 | dst += stride; | 363 | 0 | } | 364 | 0 | } | 365 | 0 | else | 366 | 0 | { | 367 | 0 | THROW_FATAL( "Unsupported size" ); | 368 | 0 | } | 369 | 18.0k | #if USE_AVX2 | 370 | | | 371 | 18.0k | _mm256_zeroupper(); | 372 | 18.0k | #endif | 373 | 18.0k | } |
|
374 | | |
375 | | template< X86_VEXT vext, int W > |
376 | | void cpyResiClip_SSE( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift ) |
377 | 20.2k | { |
378 | | #if USE_AVX2 |
379 | 20.2k | if( W >= 16 ) |
380 | 15.5k | { |
381 | 15.5k | __m256i vmin = _mm256_set1_epi32( outputMin ); |
382 | 15.5k | __m256i vmax = _mm256_set1_epi32( outputMax ); |
383 | 15.5k | __m256i vrnd = _mm256_set1_epi32( round ); |
384 | | |
385 | 414k | while( height-- ) |
386 | 399k | { |
387 | 1.27M | for( int col = 0; col < width; col += 16 ) |
388 | 877k | { |
389 | 877k | __m256i |
390 | 877k | vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] ); |
391 | 877k | vsrc1 = _mm256_add_epi32 ( vsrc1, vrnd ); |
392 | 877k | vsrc1 = _mm256_srai_epi32 ( vsrc1, shift ); |
393 | 877k | vsrc1 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc1, vmax ), vmin ); |
394 | | |
395 | 877k | __m256i |
396 | 877k | vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] ); |
397 | 877k | vsrc2 = _mm256_add_epi32 ( vsrc2, vrnd ); |
398 | 877k | vsrc2 = _mm256_srai_epi32 ( vsrc2, shift ); |
399 | 877k | vsrc2 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc2, vmax ), vmin ); |
400 | | |
401 | 877k | __m256i |
402 | 877k | vdst = _mm256_packs_epi32( vsrc1, vsrc2 ); |
403 | 877k | vdst = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); |
404 | 877k | _mm256_storeu_si256 ( ( __m256i * ) &dst[col], vdst ); |
405 | 877k | } |
406 | | |
407 | 399k | src += width; |
408 | 399k | dst += stride; |
409 | 399k | } |
410 | 15.5k | } |
411 | 4.71k | else |
412 | 4.71k | #endif |
413 | 4.71k | if( W >= 8 ) |
414 | 3.21k | { |
415 | 3.21k | __m128i vmin = _mm_set1_epi32( outputMin ); |
416 | 3.21k | __m128i vmax = _mm_set1_epi32( outputMax ); |
417 | 3.21k | __m128i vrnd = _mm_set1_epi32( round ); |
418 | | |
419 | 46.8k | while( height-- ) |
420 | 43.6k | { |
421 | 87.3k | for( int col = 0; col < width; col += 8 ) |
422 | 43.6k | { |
423 | 43.6k | __m128i |
424 | 43.6k | vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] ); |
425 | 43.6k | vsrc1 = _mm_add_epi32 ( vsrc1, vrnd ); |
426 | 43.6k | vsrc1 = _mm_srai_epi32 ( vsrc1, shift ); |
427 | 43.6k | vsrc1 = _mm_max_epi32 ( _mm_min_epi32( vsrc1, vmax ), vmin ); |
428 | 43.6k | __m128i |
429 | 43.6k | vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] ); |
430 | 43.6k | vsrc2 = _mm_add_epi32 ( vsrc2, vrnd ); |
431 | 43.6k | vsrc2 = _mm_srai_epi32 ( vsrc2, shift ); |
432 | 43.6k | vsrc2 = _mm_max_epi32 ( _mm_min_epi32( vsrc2, vmax ), vmin ); |
433 | 43.6k | __m128i |
434 | 43.6k | vdst = _mm_packs_epi32( vsrc1, vsrc2 ); |
435 | 43.6k | _mm_storeu_si128 ( ( __m128i * ) &dst[col], vdst ); |
436 | 43.6k | } |
437 | | |
438 | 43.6k | src += width; |
439 | 43.6k | dst += stride; |
440 | 43.6k | } |
441 | 3.21k | } |
442 | 1.49k | else if( W >= 4 ) |
443 | 1.49k | { |
444 | 1.49k | __m128i vmin = _mm_set1_epi32( outputMin ); |
445 | 1.49k | __m128i vmax = _mm_set1_epi32( outputMax ); |
446 | 1.49k | __m128i vrnd = _mm_set1_epi32( round ); |
447 | | |
448 | 1.49k | __m128i vzero = _mm_setzero_si128(); |
449 | 1.49k | __m128i vdst; |
450 | | |
451 | 19.8k | while( height-- ) |
452 | 18.3k | { |
453 | 36.7k | for( int col = 0; col < width; col += 4 ) |
454 | 18.3k | { |
455 | 18.3k | vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] ); |
456 | 18.3k | vdst = _mm_add_epi32 ( vdst, vrnd ); |
457 | 18.3k | vdst = _mm_srai_epi32 ( vdst, shift ); |
458 | 18.3k | vdst = _mm_max_epi32 ( _mm_min_epi32( vdst, vmax ), vmin ); |
459 | 18.3k | vdst = _mm_packs_epi32( vdst, vzero ); |
460 | 18.3k | _mm_storeu_si64 ( ( __m128i * ) &dst[col], vdst ); |
461 | 18.3k | } |
462 | | |
463 | 18.3k | src += width; |
464 | 18.3k | dst += stride; |
465 | 18.3k | } |
466 | 1.49k | } |
467 | 0 | else |
468 | 0 | { |
469 | 0 | THROW_FATAL( "Unsupported size" ); |
470 | 0 | } |
471 | 20.2k | } Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int) Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int) Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 16>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int) Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 32>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int) Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 64>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int) void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int) Line | Count | Source | 377 | 1.49k | { | 378 | 1.49k | #if USE_AVX2 | 379 | 1.49k | if( W >= 16 ) | 380 | 0 | { | 381 | 0 | __m256i vmin = _mm256_set1_epi32( outputMin ); | 382 | 0 | __m256i vmax = _mm256_set1_epi32( outputMax ); | 383 | 0 | __m256i vrnd = _mm256_set1_epi32( round ); | 384 | |
| 385 | 0 | while( height-- ) | 386 | 0 | { | 387 | 0 | for( int col = 0; col < width; col += 16 ) | 388 | 0 | { | 389 | 0 | __m256i | 390 | 0 | vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] ); | 391 | 0 | vsrc1 = _mm256_add_epi32 ( vsrc1, vrnd ); | 392 | 0 | vsrc1 = _mm256_srai_epi32 ( vsrc1, shift ); | 393 | 0 | vsrc1 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc1, vmax ), vmin ); | 394 | |
| 395 | 0 | __m256i | 396 | 0 | vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] ); | 397 | 0 | vsrc2 = _mm256_add_epi32 ( vsrc2, vrnd ); | 398 | 0 | vsrc2 = _mm256_srai_epi32 ( vsrc2, shift ); | 399 | 0 | vsrc2 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc2, vmax ), vmin ); | 400 | |
| 401 | 0 | __m256i | 402 | 0 | vdst = _mm256_packs_epi32( vsrc1, vsrc2 ); | 403 | 0 | vdst = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 404 | 0 | _mm256_storeu_si256 ( ( __m256i * ) &dst[col], vdst ); | 405 | 0 | } | 406 | |
| 407 | 0 | src += width; | 408 | 0 | dst += stride; | 409 | 0 | } | 410 | 0 | } | 411 | 1.49k | else | 412 | 1.49k | #endif | 413 | 1.49k | if( W >= 8 ) | 414 | 0 | { | 415 | 0 | __m128i vmin = _mm_set1_epi32( outputMin ); | 416 | 0 | __m128i vmax = _mm_set1_epi32( outputMax ); | 417 | 0 | __m128i vrnd = _mm_set1_epi32( round ); | 418 | |
| 419 | 0 | while( height-- ) | 420 | 0 | { | 421 | 0 | for( int col = 0; col < width; col += 8 ) | 422 | 0 | { | 423 | 0 | __m128i | 424 | 0 | vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] ); | 425 | 0 | vsrc1 = _mm_add_epi32 ( vsrc1, vrnd ); | 426 | 0 | vsrc1 = _mm_srai_epi32 ( vsrc1, shift ); | 427 | 0 | vsrc1 = _mm_max_epi32 ( _mm_min_epi32( vsrc1, vmax ), vmin ); | 428 | 0 | __m128i | 429 | 0 | vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] ); | 430 | 0 | vsrc2 = _mm_add_epi32 ( vsrc2, vrnd ); | 431 | 0 | vsrc2 = _mm_srai_epi32 ( vsrc2, shift ); | 432 | 0 | vsrc2 = _mm_max_epi32 ( _mm_min_epi32( vsrc2, vmax ), vmin ); | 433 | 0 | __m128i | 434 | 0 | vdst = _mm_packs_epi32( vsrc1, vsrc2 ); | 435 | 0 | _mm_storeu_si128 ( ( __m128i * ) &dst[col], vdst ); | 436 | 0 | } | 437 | |
| 438 | 0 | src += width; | 439 | 0 | dst += stride; | 440 | 0 | } | 441 | 0 | } | 442 | 1.49k | else if( W >= 4 ) | 443 | 1.49k | { | 444 | 1.49k | __m128i vmin = _mm_set1_epi32( outputMin ); | 445 | 1.49k | __m128i vmax = _mm_set1_epi32( outputMax ); | 446 | 1.49k | __m128i vrnd = _mm_set1_epi32( round ); | 447 | | | 448 | 1.49k | __m128i vzero = _mm_setzero_si128(); | 449 | 1.49k | __m128i vdst; | 450 | | | 451 | 19.8k | while( height-- ) | 452 | 18.3k | { | 453 | 36.7k | for( int col = 0; col < width; col += 4 ) | 454 | 18.3k | { | 455 | 18.3k | vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] ); | 456 | 18.3k | vdst = _mm_add_epi32 ( vdst, vrnd ); | 457 | 18.3k | vdst = _mm_srai_epi32 ( vdst, shift ); | 458 | 18.3k | vdst = _mm_max_epi32 ( _mm_min_epi32( vdst, vmax ), vmin ); | 459 | 18.3k | vdst = _mm_packs_epi32( vdst, vzero ); | 460 | 18.3k | _mm_storeu_si64 ( ( __m128i * ) &dst[col], vdst ); | 461 | 18.3k | } | 462 | | | 463 | 18.3k | src += width; | 464 | 18.3k | dst += stride; | 465 | 18.3k | } | 466 | 1.49k | } | 467 | 0 | else | 468 | 0 | { | 469 | 0 | THROW_FATAL( "Unsupported size" ); | 470 | 0 | } | 471 | 1.49k | } |
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int) Line | Count | Source | 377 | 3.21k | { | 378 | 3.21k | #if USE_AVX2 | 379 | 3.21k | if( W >= 16 ) | 380 | 0 | { | 381 | 0 | __m256i vmin = _mm256_set1_epi32( outputMin ); | 382 | 0 | __m256i vmax = _mm256_set1_epi32( outputMax ); | 383 | 0 | __m256i vrnd = _mm256_set1_epi32( round ); | 384 | |
| 385 | 0 | while( height-- ) | 386 | 0 | { | 387 | 0 | for( int col = 0; col < width; col += 16 ) | 388 | 0 | { | 389 | 0 | __m256i | 390 | 0 | vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] ); | 391 | 0 | vsrc1 = _mm256_add_epi32 ( vsrc1, vrnd ); | 392 | 0 | vsrc1 = _mm256_srai_epi32 ( vsrc1, shift ); | 393 | 0 | vsrc1 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc1, vmax ), vmin ); | 394 | |
| 395 | 0 | __m256i | 396 | 0 | vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] ); | 397 | 0 | vsrc2 = _mm256_add_epi32 ( vsrc2, vrnd ); | 398 | 0 | vsrc2 = _mm256_srai_epi32 ( vsrc2, shift ); | 399 | 0 | vsrc2 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc2, vmax ), vmin ); | 400 | |
| 401 | 0 | __m256i | 402 | 0 | vdst = _mm256_packs_epi32( vsrc1, vsrc2 ); | 403 | 0 | vdst = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 404 | 0 | _mm256_storeu_si256 ( ( __m256i * ) &dst[col], vdst ); | 405 | 0 | } | 406 | |
| 407 | 0 | src += width; | 408 | 0 | dst += stride; | 409 | 0 | } | 410 | 0 | } | 411 | 3.21k | else | 412 | 3.21k | #endif | 413 | 3.21k | if( W >= 8 ) | 414 | 3.21k | { | 415 | 3.21k | __m128i vmin = _mm_set1_epi32( outputMin ); | 416 | 3.21k | __m128i vmax = _mm_set1_epi32( outputMax ); | 417 | 3.21k | __m128i vrnd = _mm_set1_epi32( round ); | 418 | | | 419 | 46.8k | while( height-- ) | 420 | 43.6k | { | 421 | 87.3k | for( int col = 0; col < width; col += 8 ) | 422 | 43.6k | { | 423 | 43.6k | __m128i | 424 | 43.6k | vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] ); | 425 | 43.6k | vsrc1 = _mm_add_epi32 ( vsrc1, vrnd ); | 426 | 43.6k | vsrc1 = _mm_srai_epi32 ( vsrc1, shift ); | 427 | 43.6k | vsrc1 = _mm_max_epi32 ( _mm_min_epi32( vsrc1, vmax ), vmin ); | 428 | 43.6k | __m128i | 429 | 43.6k | vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] ); | 430 | 43.6k | vsrc2 = _mm_add_epi32 ( vsrc2, vrnd ); | 431 | 43.6k | vsrc2 = _mm_srai_epi32 ( vsrc2, shift ); | 432 | 43.6k | vsrc2 = _mm_max_epi32 ( _mm_min_epi32( vsrc2, vmax ), vmin ); | 433 | 43.6k | __m128i | 434 | 43.6k | vdst = _mm_packs_epi32( vsrc1, vsrc2 ); | 435 | 43.6k | _mm_storeu_si128 ( ( __m128i * ) &dst[col], vdst ); | 436 | 43.6k | } | 437 | | | 438 | 43.6k | src += width; | 439 | 43.6k | dst += stride; | 440 | 43.6k | } | 441 | 3.21k | } | 442 | 0 | else if( W >= 4 ) | 443 | 0 | { | 444 | 0 | __m128i vmin = _mm_set1_epi32( outputMin ); | 445 | 0 | __m128i vmax = _mm_set1_epi32( outputMax ); | 446 | 0 | __m128i vrnd = _mm_set1_epi32( round ); | 447 | |
| 448 | 0 | __m128i vzero = _mm_setzero_si128(); | 449 | 0 | __m128i vdst; | 450 | |
| 451 | 0 | while( height-- ) | 452 | 0 | { | 453 | 0 | for( int col = 0; col < width; col += 4 ) | 454 | 0 | { | 455 | 0 | vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] ); | 456 | 0 | vdst = _mm_add_epi32 ( vdst, vrnd ); | 457 | 0 | vdst = _mm_srai_epi32 ( vdst, shift ); | 458 | 0 | vdst = _mm_max_epi32 ( _mm_min_epi32( vdst, vmax ), vmin ); | 459 | 0 | vdst = _mm_packs_epi32( vdst, vzero ); | 460 | 0 | _mm_storeu_si64 ( ( __m128i * ) &dst[col], vdst ); | 461 | 0 | } | 462 | |
| 463 | 0 | src += width; | 464 | 0 | dst += stride; | 465 | 0 | } | 466 | 0 | } | 467 | 0 | else | 468 | 0 | { | 469 | 0 | THROW_FATAL( "Unsupported size" ); | 470 | 0 | } | 471 | 3.21k | } |
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 16>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int) Line | Count | Source | 377 | 4.74k | { | 378 | 4.74k | #if USE_AVX2 | 379 | 4.74k | if( W >= 16 ) | 380 | 4.74k | { | 381 | 4.74k | __m256i vmin = _mm256_set1_epi32( outputMin ); | 382 | 4.74k | __m256i vmax = _mm256_set1_epi32( outputMax ); | 383 | 4.74k | __m256i vrnd = _mm256_set1_epi32( round ); | 384 | | | 385 | 80.9k | while( height-- ) | 386 | 76.2k | { | 387 | 152k | for( int col = 0; col < width; col += 16 ) | 388 | 76.2k | { | 389 | 76.2k | __m256i | 390 | 76.2k | vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] ); | 391 | 76.2k | vsrc1 = _mm256_add_epi32 ( vsrc1, vrnd ); | 392 | 76.2k | vsrc1 = _mm256_srai_epi32 ( vsrc1, shift ); | 393 | 76.2k | vsrc1 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc1, vmax ), vmin ); | 394 | | | 395 | 76.2k | __m256i | 396 | 76.2k | vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] ); | 397 | 76.2k | vsrc2 = _mm256_add_epi32 ( vsrc2, vrnd ); | 398 | 76.2k | vsrc2 = _mm256_srai_epi32 ( vsrc2, shift ); | 399 | 76.2k | vsrc2 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc2, vmax ), vmin ); | 400 | | | 401 | 76.2k | __m256i | 402 | 76.2k | vdst = _mm256_packs_epi32( vsrc1, vsrc2 ); | 403 | 76.2k | vdst = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 404 | 76.2k | _mm256_storeu_si256 ( ( __m256i * ) &dst[col], vdst ); | 405 | 76.2k | } | 406 | | | 407 | 76.2k | src += width; | 408 | 76.2k | dst += stride; | 409 | 76.2k | } | 410 | 4.74k | } | 411 | 0 | else | 412 | 0 | #endif | 413 | 0 | if( W >= 8 ) | 414 | 0 | { | 415 | 0 | __m128i vmin = _mm_set1_epi32( outputMin ); | 416 | 0 | __m128i vmax = _mm_set1_epi32( outputMax ); | 417 | 0 | __m128i vrnd = _mm_set1_epi32( round ); | 418 | |
| 419 | 0 | while( height-- ) | 420 | 0 | { | 421 | 0 | for( int col = 0; col < width; col += 8 ) | 422 | 0 | { | 423 | 0 | __m128i | 424 | 0 | vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] ); | 425 | 0 | vsrc1 = _mm_add_epi32 ( vsrc1, vrnd ); | 426 | 0 | vsrc1 = _mm_srai_epi32 ( vsrc1, shift ); | 427 | 0 | vsrc1 = _mm_max_epi32 ( _mm_min_epi32( vsrc1, vmax ), vmin ); | 428 | 0 | __m128i | 429 | 0 | vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] ); | 430 | 0 | vsrc2 = _mm_add_epi32 ( vsrc2, vrnd ); | 431 | 0 | vsrc2 = _mm_srai_epi32 ( vsrc2, shift ); | 432 | 0 | vsrc2 = _mm_max_epi32 ( _mm_min_epi32( vsrc2, vmax ), vmin ); | 433 | 0 | __m128i | 434 | 0 | vdst = _mm_packs_epi32( vsrc1, vsrc2 ); | 435 | 0 | _mm_storeu_si128 ( ( __m128i * ) &dst[col], vdst ); | 436 | 0 | } | 437 | |
| 438 | 0 | src += width; | 439 | 0 | dst += stride; | 440 | 0 | } | 441 | 0 | } | 442 | 0 | else if( W >= 4 ) | 443 | 0 | { | 444 | 0 | __m128i vmin = _mm_set1_epi32( outputMin ); | 445 | 0 | __m128i vmax = _mm_set1_epi32( outputMax ); | 446 | 0 | __m128i vrnd = _mm_set1_epi32( round ); | 447 | |
| 448 | 0 | __m128i vzero = _mm_setzero_si128(); | 449 | 0 | __m128i vdst; | 450 | |
| 451 | 0 | while( height-- ) | 452 | 0 | { | 453 | 0 | for( int col = 0; col < width; col += 4 ) | 454 | 0 | { | 455 | 0 | vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] ); | 456 | 0 | vdst = _mm_add_epi32 ( vdst, vrnd ); | 457 | 0 | vdst = _mm_srai_epi32 ( vdst, shift ); | 458 | 0 | vdst = _mm_max_epi32 ( _mm_min_epi32( vdst, vmax ), vmin ); | 459 | 0 | vdst = _mm_packs_epi32( vdst, vzero ); | 460 | 0 | _mm_storeu_si64 ( ( __m128i * ) &dst[col], vdst ); | 461 | 0 | } | 462 | |
| 463 | 0 | src += width; | 464 | 0 | dst += stride; | 465 | 0 | } | 466 | 0 | } | 467 | 0 | else | 468 | 0 | { | 469 | 0 | THROW_FATAL( "Unsupported size" ); | 470 | 0 | } | 471 | 4.74k | } |
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 32>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int) Line | Count | Source | 377 | 9.45k | { | 378 | 9.45k | #if USE_AVX2 | 379 | 9.45k | if( W >= 16 ) | 380 | 9.45k | { | 381 | 9.45k | __m256i vmin = _mm256_set1_epi32( outputMin ); | 382 | 9.45k | __m256i vmax = _mm256_set1_epi32( outputMax ); | 383 | 9.45k | __m256i vrnd = _mm256_set1_epi32( round ); | 384 | | | 385 | 254k | while( height-- ) | 386 | 245k | { | 387 | 735k | for( int col = 0; col < width; col += 16 ) | 388 | 490k | { | 389 | 490k | __m256i | 390 | 490k | vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] ); | 391 | 490k | vsrc1 = _mm256_add_epi32 ( vsrc1, vrnd ); | 392 | 490k | vsrc1 = _mm256_srai_epi32 ( vsrc1, shift ); | 393 | 490k | vsrc1 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc1, vmax ), vmin ); | 394 | | | 395 | 490k | __m256i | 396 | 490k | vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] ); | 397 | 490k | vsrc2 = _mm256_add_epi32 ( vsrc2, vrnd ); | 398 | 490k | vsrc2 = _mm256_srai_epi32 ( vsrc2, shift ); | 399 | 490k | vsrc2 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc2, vmax ), vmin ); | 400 | | | 401 | 490k | __m256i | 402 | 490k | vdst = _mm256_packs_epi32( vsrc1, vsrc2 ); | 403 | 490k | vdst = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 404 | 490k | _mm256_storeu_si256 ( ( __m256i * ) &dst[col], vdst ); | 405 | 490k | } | 406 | | | 407 | 245k | src += width; | 408 | 245k | dst += stride; | 409 | 245k | } | 410 | 9.45k | } | 411 | 0 | else | 412 | 0 | #endif | 413 | 0 | if( W >= 8 ) | 414 | 0 | { | 415 | 0 | __m128i vmin = _mm_set1_epi32( outputMin ); | 416 | 0 | __m128i vmax = _mm_set1_epi32( outputMax ); | 417 | 0 | __m128i vrnd = _mm_set1_epi32( round ); | 418 | |
| 419 | 0 | while( height-- ) | 420 | 0 | { | 421 | 0 | for( int col = 0; col < width; col += 8 ) | 422 | 0 | { | 423 | 0 | __m128i | 424 | 0 | vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] ); | 425 | 0 | vsrc1 = _mm_add_epi32 ( vsrc1, vrnd ); | 426 | 0 | vsrc1 = _mm_srai_epi32 ( vsrc1, shift ); | 427 | 0 | vsrc1 = _mm_max_epi32 ( _mm_min_epi32( vsrc1, vmax ), vmin ); | 428 | 0 | __m128i | 429 | 0 | vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] ); | 430 | 0 | vsrc2 = _mm_add_epi32 ( vsrc2, vrnd ); | 431 | 0 | vsrc2 = _mm_srai_epi32 ( vsrc2, shift ); | 432 | 0 | vsrc2 = _mm_max_epi32 ( _mm_min_epi32( vsrc2, vmax ), vmin ); | 433 | 0 | __m128i | 434 | 0 | vdst = _mm_packs_epi32( vsrc1, vsrc2 ); | 435 | 0 | _mm_storeu_si128 ( ( __m128i * ) &dst[col], vdst ); | 436 | 0 | } | 437 | |
| 438 | 0 | src += width; | 439 | 0 | dst += stride; | 440 | 0 | } | 441 | 0 | } | 442 | 0 | else if( W >= 4 ) | 443 | 0 | { | 444 | 0 | __m128i vmin = _mm_set1_epi32( outputMin ); | 445 | 0 | __m128i vmax = _mm_set1_epi32( outputMax ); | 446 | 0 | __m128i vrnd = _mm_set1_epi32( round ); | 447 | |
| 448 | 0 | __m128i vzero = _mm_setzero_si128(); | 449 | 0 | __m128i vdst; | 450 | |
| 451 | 0 | while( height-- ) | 452 | 0 | { | 453 | 0 | for( int col = 0; col < width; col += 4 ) | 454 | 0 | { | 455 | 0 | vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] ); | 456 | 0 | vdst = _mm_add_epi32 ( vdst, vrnd ); | 457 | 0 | vdst = _mm_srai_epi32 ( vdst, shift ); | 458 | 0 | vdst = _mm_max_epi32 ( _mm_min_epi32( vdst, vmax ), vmin ); | 459 | 0 | vdst = _mm_packs_epi32( vdst, vzero ); | 460 | 0 | _mm_storeu_si64 ( ( __m128i * ) &dst[col], vdst ); | 461 | 0 | } | 462 | |
| 463 | 0 | src += width; | 464 | 0 | dst += stride; | 465 | 0 | } | 466 | 0 | } | 467 | 0 | else | 468 | 0 | { | 469 | 0 | THROW_FATAL( "Unsupported size" ); | 470 | 0 | } | 471 | 9.45k | } |
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 64>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int) Line | Count | Source | 377 | 1.31k | { | 378 | 1.31k | #if USE_AVX2 | 379 | 1.31k | if( W >= 16 ) | 380 | 1.31k | { | 381 | 1.31k | __m256i vmin = _mm256_set1_epi32( outputMin ); | 382 | 1.31k | __m256i vmax = _mm256_set1_epi32( outputMax ); | 383 | 1.31k | __m256i vrnd = _mm256_set1_epi32( round ); | 384 | | | 385 | 78.9k | while( height-- ) | 386 | 77.6k | { | 387 | 388k | for( int col = 0; col < width; col += 16 ) | 388 | 310k | { | 389 | 310k | __m256i | 390 | 310k | vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] ); | 391 | 310k | vsrc1 = _mm256_add_epi32 ( vsrc1, vrnd ); | 392 | 310k | vsrc1 = _mm256_srai_epi32 ( vsrc1, shift ); | 393 | 310k | vsrc1 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc1, vmax ), vmin ); | 394 | | | 395 | 310k | __m256i | 396 | 310k | vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] ); | 397 | 310k | vsrc2 = _mm256_add_epi32 ( vsrc2, vrnd ); | 398 | 310k | vsrc2 = _mm256_srai_epi32 ( vsrc2, shift ); | 399 | 310k | vsrc2 = _mm256_max_epi32 ( _mm256_min_epi32( vsrc2, vmax ), vmin ); | 400 | | | 401 | 310k | __m256i | 402 | 310k | vdst = _mm256_packs_epi32( vsrc1, vsrc2 ); | 403 | 310k | vdst = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) ); | 404 | 310k | _mm256_storeu_si256 ( ( __m256i * ) &dst[col], vdst ); | 405 | 310k | } | 406 | | | 407 | 77.6k | src += width; | 408 | 77.6k | dst += stride; | 409 | 77.6k | } | 410 | 1.31k | } | 411 | 0 | else | 412 | 0 | #endif | 413 | 0 | if( W >= 8 ) | 414 | 0 | { | 415 | 0 | __m128i vmin = _mm_set1_epi32( outputMin ); | 416 | 0 | __m128i vmax = _mm_set1_epi32( outputMax ); | 417 | 0 | __m128i vrnd = _mm_set1_epi32( round ); | 418 | |
| 419 | 0 | while( height-- ) | 420 | 0 | { | 421 | 0 | for( int col = 0; col < width; col += 8 ) | 422 | 0 | { | 423 | 0 | __m128i | 424 | 0 | vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] ); | 425 | 0 | vsrc1 = _mm_add_epi32 ( vsrc1, vrnd ); | 426 | 0 | vsrc1 = _mm_srai_epi32 ( vsrc1, shift ); | 427 | 0 | vsrc1 = _mm_max_epi32 ( _mm_min_epi32( vsrc1, vmax ), vmin ); | 428 | 0 | __m128i | 429 | 0 | vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] ); | 430 | 0 | vsrc2 = _mm_add_epi32 ( vsrc2, vrnd ); | 431 | 0 | vsrc2 = _mm_srai_epi32 ( vsrc2, shift ); | 432 | 0 | vsrc2 = _mm_max_epi32 ( _mm_min_epi32( vsrc2, vmax ), vmin ); | 433 | 0 | __m128i | 434 | 0 | vdst = _mm_packs_epi32( vsrc1, vsrc2 ); | 435 | 0 | _mm_storeu_si128 ( ( __m128i * ) &dst[col], vdst ); | 436 | 0 | } | 437 | |
| 438 | 0 | src += width; | 439 | 0 | dst += stride; | 440 | 0 | } | 441 | 0 | } | 442 | 0 | else if( W >= 4 ) | 443 | 0 | { | 444 | 0 | __m128i vmin = _mm_set1_epi32( outputMin ); | 445 | 0 | __m128i vmax = _mm_set1_epi32( outputMax ); | 446 | 0 | __m128i vrnd = _mm_set1_epi32( round ); | 447 | |
| 448 | 0 | __m128i vzero = _mm_setzero_si128(); | 449 | 0 | __m128i vdst; | 450 | |
| 451 | 0 | while( height-- ) | 452 | 0 | { | 453 | 0 | for( int col = 0; col < width; col += 4 ) | 454 | 0 | { | 455 | 0 | vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] ); | 456 | 0 | vdst = _mm_add_epi32 ( vdst, vrnd ); | 457 | 0 | vdst = _mm_srai_epi32 ( vdst, shift ); | 458 | 0 | vdst = _mm_max_epi32 ( _mm_min_epi32( vdst, vmax ), vmin ); | 459 | 0 | vdst = _mm_packs_epi32( vdst, vzero ); | 460 | 0 | _mm_storeu_si64 ( ( __m128i * ) &dst[col], vdst ); | 461 | 0 | } | 462 | |
| 463 | 0 | src += width; | 464 | 0 | dst += stride; | 465 | 0 | } | 466 | 0 | } | 467 | 0 | else | 468 | 0 | { | 469 | 0 | THROW_FATAL( "Unsupported size" ); | 470 | 0 | } | 471 | 1.31k | } |
|
472 | | |
473 | | template<X86_VEXT vext> |
474 | | static void simdInvLfnstNxNCore( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize ) |
475 | 5.29k | { |
476 | 5.29k | CHECK( index > 2 || ( zeroOutSize != 8 && zeroOutSize != 16 ), "Wrong parameters" ); |
477 | | |
478 | 5.29k | static constexpr int maxLog2TrDynamicRange = 15; |
479 | 5.29k | const TCoeff outputMinimum = -( 1 << maxLog2TrDynamicRange ); |
480 | 5.29k | const TCoeff outputMaximum = ( 1 << maxLog2TrDynamicRange ) - 1; |
481 | 5.29k | const int8_t* trMat = ( size > 4 ) ? g_lfnst8x8[mode][index][0] : g_lfnst4x4[mode][index][0]; |
482 | 5.29k | const int trSize = ( size > 4 ) ? 48 : 16; |
483 | 5.29k | int* out = dst; |
484 | | |
485 | 5.29k | const __m128i vzero = _mm_setzero_si128(); |
486 | 5.29k | const __m128i vmin = _mm_set1_epi32( outputMinimum ); |
487 | 5.29k | const __m128i vmax = _mm_set1_epi32( outputMaximum ); |
488 | | |
489 | 58.2k | for( int j = 0; j < trSize; j += 4, out += 4 ) |
490 | 52.9k | { |
491 | 52.9k | __m128i vsum[4]; |
492 | | |
493 | 264k | for( int k = 0; k < 4; k++, trMat += 16 ) |
494 | 211k | { |
495 | 211k | const int8_t* trMatTmp = trMat; |
496 | 211k | int* srcPtr = src; |
497 | | |
498 | 211k | __m128i vsrc; |
499 | 211k | __m128i vtr; |
500 | 211k | __m128i vtmp; |
501 | 211k | __m128i vcur = vzero; |
502 | | |
503 | 609k | for( int i = 0; i < zeroOutSize; i += 8, srcPtr += 8, trMatTmp += 8 ) |
504 | 398k | { |
505 | 398k | vsrc = _mm_loadu_si128( ( const __m128i* ) srcPtr ); |
506 | 398k | vtr = _mm_loadu_si64( ( const __m128i* ) trMatTmp ); |
507 | 398k | vtr = _mm_cvtepi8_epi16( vtr ); |
508 | 398k | vtmp = _mm_cvtepi16_epi32( vtr ); |
509 | | |
510 | 398k | vtmp = _mm_mullo_epi32( vsrc, vtmp ); |
511 | 398k | vcur = _mm_add_epi32( vtmp, vcur ); |
512 | | |
513 | 398k | vsrc = _mm_loadu_si128( ( const __m128i* ) &srcPtr[4] ); |
514 | 398k | vtmp = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtr, vzero ) ); |
515 | | |
516 | 398k | vtmp = _mm_mullo_epi32( vsrc, vtmp ); |
517 | 398k | vcur = _mm_add_epi32( vtmp, vcur ); |
518 | 398k | } |
519 | | |
520 | 211k | vsum[k] = vcur; |
521 | 211k | } |
522 | | |
523 | 52.9k | __m128i vout = _mm_hadd_epi32( _mm_hadd_epi32( vsum[0], vsum[1] ), _mm_hadd_epi32( vsum[2], vsum[3] ) ); |
524 | 52.9k | vout = _mm_add_epi32( vout, _mm_set1_epi32( 64 ) ); |
525 | 52.9k | vout = _mm_srai_epi32( vout, 7 ); |
526 | 52.9k | vout = _mm_min_epi32( _mm_max_epi32( vmin, vout ), vmax ); |
527 | | |
528 | 52.9k | _mm_storeu_si128( ( __m128i* ) out, vout ); |
529 | 52.9k | } |
530 | 5.29k | } Unexecuted instantiation: Trafo_sse41.cpp:void vvdec::simdInvLfnstNxNCore<(vvdec::x86_simd::X86_VEXT)1>(int*, int*, unsigned int, unsigned int, unsigned int, int) Trafo_avx2.cpp:void vvdec::simdInvLfnstNxNCore<(vvdec::x86_simd::X86_VEXT)4>(int*, int*, unsigned int, unsigned int, unsigned int, int) Line | Count | Source | 475 | 5.29k | { | 476 | 5.29k | CHECK( index > 2 || ( zeroOutSize != 8 && zeroOutSize != 16 ), "Wrong parameters" ); | 477 | | | 478 | 5.29k | static constexpr int maxLog2TrDynamicRange = 15; | 479 | 5.29k | const TCoeff outputMinimum = -( 1 << maxLog2TrDynamicRange ); | 480 | 5.29k | const TCoeff outputMaximum = ( 1 << maxLog2TrDynamicRange ) - 1; | 481 | 5.29k | const int8_t* trMat = ( size > 4 ) ? g_lfnst8x8[mode][index][0] : g_lfnst4x4[mode][index][0]; | 482 | 5.29k | const int trSize = ( size > 4 ) ? 48 : 16; | 483 | 5.29k | int* out = dst; | 484 | | | 485 | 5.29k | const __m128i vzero = _mm_setzero_si128(); | 486 | 5.29k | const __m128i vmin = _mm_set1_epi32( outputMinimum ); | 487 | 5.29k | const __m128i vmax = _mm_set1_epi32( outputMaximum ); | 488 | | | 489 | 58.2k | for( int j = 0; j < trSize; j += 4, out += 4 ) | 490 | 52.9k | { | 491 | 52.9k | __m128i vsum[4]; | 492 | | | 493 | 264k | for( int k = 0; k < 4; k++, trMat += 16 ) | 494 | 211k | { | 495 | 211k | const int8_t* trMatTmp = trMat; | 496 | 211k | int* srcPtr = src; | 497 | | | 498 | 211k | __m128i vsrc; | 499 | 211k | __m128i vtr; | 500 | 211k | __m128i vtmp; | 501 | 211k | __m128i vcur = vzero; | 502 | | | 503 | 609k | for( int i = 0; i < zeroOutSize; i += 8, srcPtr += 8, trMatTmp += 8 ) | 504 | 398k | { | 505 | 398k | vsrc = _mm_loadu_si128( ( const __m128i* ) srcPtr ); | 506 | 398k | vtr = _mm_loadu_si64( ( const __m128i* ) trMatTmp ); | 507 | 398k | vtr = _mm_cvtepi8_epi16( vtr ); | 508 | 398k | vtmp = _mm_cvtepi16_epi32( vtr ); | 509 | | | 510 | 398k | vtmp = _mm_mullo_epi32( vsrc, vtmp ); | 511 | 398k | vcur = _mm_add_epi32( vtmp, vcur ); | 512 | | | 513 | 398k | vsrc = _mm_loadu_si128( ( const __m128i* ) &srcPtr[4] ); | 514 | 398k | vtmp = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtr, vzero ) ); | 515 | | | 516 | 398k | vtmp = _mm_mullo_epi32( vsrc, vtmp ); | 517 | 398k | vcur = _mm_add_epi32( vtmp, vcur ); | 518 | 398k | } | 519 | | | 520 | 211k | vsum[k] = vcur; | 521 | 211k | } | 522 | | | 523 | 52.9k | __m128i vout = _mm_hadd_epi32( _mm_hadd_epi32( vsum[0], vsum[1] ), _mm_hadd_epi32( vsum[2], vsum[3] ) ); | 524 | 52.9k | vout = _mm_add_epi32( vout, _mm_set1_epi32( 64 ) ); | 525 | 52.9k | vout = _mm_srai_epi32( vout, 7 ); | 526 | 52.9k | vout = _mm_min_epi32( _mm_max_epi32( vmin, vout ), vmax ); | 527 | | | 528 | 52.9k | _mm_storeu_si128( ( __m128i* ) out, vout ); | 529 | 52.9k | } | 530 | 5.29k | } |
|
531 | | |
532 | | template<X86_VEXT vext> |
533 | | void TCoeffOps::_initTCoeffOpsX86() |
534 | 2.42k | { |
535 | 2.42k | cpyResiClip[2] = cpyResiClip_SSE<vext, 4>; |
536 | 2.42k | cpyResiClip[3] = cpyResiClip_SSE<vext, 8>; |
537 | 2.42k | cpyResiClip[4] = cpyResiClip_SSE<vext, 16>; |
538 | 2.42k | cpyResiClip[5] = cpyResiClip_SSE<vext, 32>; |
539 | 2.42k | cpyResiClip[6] = cpyResiClip_SSE<vext, 64>; |
540 | 2.42k | roundClip4 = roundClip_SSE<vext, 4>; |
541 | 2.42k | roundClip8 = roundClip_SSE<vext, 8>; |
542 | 2.42k | fastInvCore[0] = fastInv_SSE <vext, 4>; |
543 | 2.42k | fastInvCore[1] = fastInv_SSE <vext, 8>; |
544 | 2.42k | fastInvCore[2] = fastInv_SSE <vext, 16>; |
545 | 2.42k | fastInvCore[3] = fastInv_SSE <vext, 32>; |
546 | 2.42k | fastInvCore[4] = fastInv_SSE <vext, 64>; |
547 | 2.42k | } Unexecuted instantiation: void vvdec::TCoeffOps::_initTCoeffOpsX86<(vvdec::x86_simd::X86_VEXT)1>() void vvdec::TCoeffOps::_initTCoeffOpsX86<(vvdec::x86_simd::X86_VEXT)4>() Line | Count | Source | 534 | 2.42k | { | 535 | 2.42k | cpyResiClip[2] = cpyResiClip_SSE<vext, 4>; | 536 | 2.42k | cpyResiClip[3] = cpyResiClip_SSE<vext, 8>; | 537 | 2.42k | cpyResiClip[4] = cpyResiClip_SSE<vext, 16>; | 538 | 2.42k | cpyResiClip[5] = cpyResiClip_SSE<vext, 32>; | 539 | 2.42k | cpyResiClip[6] = cpyResiClip_SSE<vext, 64>; | 540 | 2.42k | roundClip4 = roundClip_SSE<vext, 4>; | 541 | 2.42k | roundClip8 = roundClip_SSE<vext, 8>; | 542 | 2.42k | fastInvCore[0] = fastInv_SSE <vext, 4>; | 543 | 2.42k | fastInvCore[1] = fastInv_SSE <vext, 8>; | 544 | 2.42k | fastInvCore[2] = fastInv_SSE <vext, 16>; | 545 | 2.42k | fastInvCore[3] = fastInv_SSE <vext, 32>; | 546 | 2.42k | fastInvCore[4] = fastInv_SSE <vext, 64>; | 547 | 2.42k | } |
|
548 | | |
549 | | template<X86_VEXT vext> |
550 | | void TrQuant::_initTrQuantX86() |
551 | 25.8k | { |
552 | 25.8k | m_invLfnstNxN = simdInvLfnstNxNCore<vext>; |
553 | 25.8k | } Unexecuted instantiation: void vvdec::TrQuant::_initTrQuantX86<(vvdec::x86_simd::X86_VEXT)1>() void vvdec::TrQuant::_initTrQuantX86<(vvdec::x86_simd::X86_VEXT)4>() Line | Count | Source | 551 | 25.8k | { | 552 | 25.8k | m_invLfnstNxN = simdInvLfnstNxNCore<vext>; | 553 | 25.8k | } |
|
554 | | |
555 | | template void TCoeffOps::_initTCoeffOpsX86<SIMDX86>(); |
556 | | template void TrQuant::_initTrQuantX86<SIMDX86>(); |
557 | | |
558 | | #endif // TARGET_SIMD_X86 |
559 | | #endif |
560 | | |
561 | | } |