Coverage Report

Created: 2026-05-24 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/TrafoX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     TrafoX86.h
44
    \brief    SIMD trafo
45
*/
46
47
//! \ingroup CommonLib
48
//! \{
49
50
51
#include "CommonLib/CommonDef.h"
52
#include "CommonLib/Rom.h"
53
54
#include "CommonDefX86.h"
55
56
#include "TrQuant.h"
57
#include "TrQuant_EMT.h"
58
59
namespace vvdec
60
{
61
62
#if ENABLE_SIMD_TCOEFF_OPS
63
#ifdef TARGET_SIMD_X86
64
65
template< X86_VEXT vext, int trSize >
66
void fastInv_SSE( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines, unsigned reducedLines, unsigned rows )
67
37.4k
{
68
37.4k
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
#if USE_AVX2
71
37.4k
  if( trSize >= 8 && vext >= AVX2 )
72
36.7k
  {
73
36.7k
    if( ( trSize & 15 ) == 0 )
74
29.9k
    {
75
29.9k
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
136k
      for( int k = 0; k < rows; k += 2 )
78
106k
      {
79
106k
              TCoeff* dstPtr =  dst;
80
81
106k
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
106k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
106k
        __m256i vsrc1v[trLoops][2];
85
        
86
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
307k
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
201k
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
201k
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
201k
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
201k
#endif
98
99
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
        }
102
103
523k
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
417k
        {
105
417k
          __m128i xscale = maxLoopL == 4
106
417k
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
417k
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
417k
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
417k
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
1.74M
          for( int l = 0; l < maxLoopL; l++ )
113
1.39M
          {
114
1.39M
            __m256i
115
1.39M
            vscale = _mm256_broadcastd_epi32( xscale );
116
1.39M
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
4.47M
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
3.08M
            {
120
3.08M
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
3.08M
              __m256i
123
3.08M
              vsrc1 = vsrc1v[col][0];
124
3.08M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
3.08M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
3.08M
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
3.08M
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
3.08M
              vsrc1 = vsrc1v[col][1];
132
3.08M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
3.08M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
3.08M
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
3.08M
            }
137
1.39M
          }
138
349k
        }
139
106k
      }
140
29.9k
    }
141
6.82k
    else
142
6.82k
    {
143
26.8k
      for( int k = 0; k < rows; k += 2 )
144
19.9k
      {
145
19.9k
              TCoeff* dstPtr  =  dst;
146
147
19.9k
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
19.9k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
19.9k
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
19.9k
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
19.9k
        __m256i vit;
154
155
19.9k
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
19.9k
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
19.9k
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
19.9k
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
19.9k
#endif
166
167
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
        }
169
        
170
67.9k
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
48.0k
        {
172
48.0k
          __m128i xscale = maxLoopL == 4
173
48.0k
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
48.0k
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
48.0k
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
48.0k
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
197k
          for( int l = 0; l < maxLoopL; l++ )
180
157k
          {
181
157k
            __m256i
182
157k
            vscale = _mm256_broadcastd_epi32( xscale );
183
157k
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
315k
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
157k
            {
187
157k
              __m256i
188
157k
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
157k
              __m256i
190
157k
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
157k
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
157k
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
157k
            }
195
157k
          }
196
39.6k
        }
197
19.9k
      }
198
6.82k
    }
199
36.7k
  }
200
#else
201
0
  if( trSize >= 8 )
202
0
  {
203
0
    for( int k = 0; k < rows; k += 2 )
204
0
    {
205
0
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
0
      {
212
0
        __m128i xscale = maxLoopL == 4
213
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
0
        for( int l = 0; l < maxLoopL; l++ )
220
0
        {
221
0
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
0
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
0
          __m128i
225
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
0
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
0
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
0
          {
230
0
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
0
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
0
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
0
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
0
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
0
          }
255
0
        }
256
0
      }
257
0
    }
258
0
  }
259
0
#endif
260
690
  else if( trSize >= 4 )
261
690
  {
262
690
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
1.88k
    for( int k = 0; k < rows; k += 2 )
265
1.19k
    {
266
1.19k
            TCoeff* dstPtr  =  dst;
267
268
1.19k
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
1.19k
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
1.19k
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
1.19k
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
1.19k
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
4.83k
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
3.63k
      {
278
3.63k
        __m128i xscale = maxLoopL == 4
279
3.63k
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
3.63k
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
3.63k
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
3.63k
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
14.3k
        for( int l = 0; l < maxLoopL; l++ )
286
11.4k
        {
287
11.4k
          __m128i
288
11.4k
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
11.4k
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
22.8k
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
11.4k
          {
293
11.4k
            __m128i
294
11.4k
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
11.4k
            __m128i 
296
11.4k
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
11.4k
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
11.4k
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
11.4k
          }
301
11.4k
        }
302
2.93k
      }
303
1.19k
    }
304
690
  }
305
0
  else
306
0
  {
307
0
    THROW_FATAL( "Unsupported size" );
308
0
  }
309
#if USE_AVX2
310
311
37.4k
  _mm256_zeroupper();
312
37.4k
#endif
313
37.4k
}
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 16>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 32>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 64>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
67
690
{
68
690
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
690
#if USE_AVX2
71
690
  if( trSize >= 8 && vext >= AVX2 )
72
0
  {
73
0
    if( ( trSize & 15 ) == 0 )
74
0
    {
75
0
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
0
      for( int k = 0; k < rows; k += 2 )
78
0
      {
79
0
              TCoeff* dstPtr =  dst;
80
81
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
0
        __m256i vsrc1v[trLoops][2];
85
        
86
0
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
0
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
0
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
0
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
0
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
0
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
0
#endif
98
99
0
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
0
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
0
        }
102
103
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
0
        {
105
0
          __m128i xscale = maxLoopL == 4
106
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
0
          for( int l = 0; l < maxLoopL; l++ )
113
0
          {
114
0
            __m256i
115
0
            vscale = _mm256_broadcastd_epi32( xscale );
116
0
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
0
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
0
            {
120
0
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
0
              __m256i
123
0
              vsrc1 = vsrc1v[col][0];
124
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
0
              vsrc1 = vsrc1v[col][1];
132
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
0
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
0
            }
137
0
          }
138
0
        }
139
0
      }
140
0
    }
141
0
    else
142
0
    {
143
0
      for( int k = 0; k < rows; k += 2 )
144
0
      {
145
0
              TCoeff* dstPtr  =  dst;
146
147
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
0
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
0
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
0
        __m256i vit;
154
155
0
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
0
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
0
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
0
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
0
#endif
166
167
0
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
0
        }
169
        
170
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
0
        {
172
0
          __m128i xscale = maxLoopL == 4
173
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
0
          for( int l = 0; l < maxLoopL; l++ )
180
0
          {
181
0
            __m256i
182
0
            vscale = _mm256_broadcastd_epi32( xscale );
183
0
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
0
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
0
            {
187
0
              __m256i
188
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
0
              __m256i
190
0
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
0
            }
195
0
          }
196
0
        }
197
0
      }
198
0
    }
199
0
  }
200
#else
201
  if( trSize >= 8 )
202
  {
203
    for( int k = 0; k < rows; k += 2 )
204
    {
205
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
      {
212
        __m128i xscale = maxLoopL == 4
213
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
        for( int l = 0; l < maxLoopL; l++ )
220
        {
221
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
          __m128i
225
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
          {
230
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
          }
255
        }
256
      }
257
    }
258
  }
259
#endif
260
690
  else if( trSize >= 4 )
261
690
  {
262
690
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
1.88k
    for( int k = 0; k < rows; k += 2 )
265
1.19k
    {
266
1.19k
            TCoeff* dstPtr  =  dst;
267
268
1.19k
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
1.19k
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
1.19k
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
1.19k
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
1.19k
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
4.83k
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
3.63k
      {
278
3.63k
        __m128i xscale = maxLoopL == 4
279
3.63k
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
3.63k
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
3.63k
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
3.63k
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
14.3k
        for( int l = 0; l < maxLoopL; l++ )
286
11.4k
        {
287
11.4k
          __m128i
288
11.4k
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
11.4k
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
22.8k
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
11.4k
          {
293
11.4k
            __m128i
294
11.4k
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
11.4k
            __m128i 
296
11.4k
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
11.4k
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
11.4k
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
11.4k
          }
301
11.4k
        }
302
2.93k
      }
303
1.19k
    }
304
690
  }
305
0
  else
306
0
  {
307
0
    THROW_FATAL( "Unsupported size" );
308
0
  }
309
690
#if USE_AVX2
310
311
690
  _mm256_zeroupper();
312
690
#endif
313
690
}
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
67
6.82k
{
68
6.82k
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
6.82k
#if USE_AVX2
71
6.82k
  if( trSize >= 8 && vext >= AVX2 )
72
6.82k
  {
73
6.82k
    if( ( trSize & 15 ) == 0 )
74
0
    {
75
0
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
0
      for( int k = 0; k < rows; k += 2 )
78
0
      {
79
0
              TCoeff* dstPtr =  dst;
80
81
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
0
        __m256i vsrc1v[trLoops][2];
85
        
86
0
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
0
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
0
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
0
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
0
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
0
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
0
#endif
98
99
0
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
0
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
0
        }
102
103
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
0
        {
105
0
          __m128i xscale = maxLoopL == 4
106
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
0
          for( int l = 0; l < maxLoopL; l++ )
113
0
          {
114
0
            __m256i
115
0
            vscale = _mm256_broadcastd_epi32( xscale );
116
0
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
0
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
0
            {
120
0
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
0
              __m256i
123
0
              vsrc1 = vsrc1v[col][0];
124
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
0
              vsrc1 = vsrc1v[col][1];
132
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
0
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
0
            }
137
0
          }
138
0
        }
139
0
      }
140
0
    }
141
6.82k
    else
142
6.82k
    {
143
26.8k
      for( int k = 0; k < rows; k += 2 )
144
19.9k
      {
145
19.9k
              TCoeff* dstPtr  =  dst;
146
147
19.9k
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
19.9k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
19.9k
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
19.9k
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
19.9k
        __m256i vit;
154
155
19.9k
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
19.9k
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
19.9k
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
19.9k
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
19.9k
#endif
166
167
19.9k
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
19.9k
        }
169
        
170
67.9k
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
48.0k
        {
172
48.0k
          __m128i xscale = maxLoopL == 4
173
48.0k
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
48.0k
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
48.0k
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
48.0k
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
197k
          for( int l = 0; l < maxLoopL; l++ )
180
157k
          {
181
157k
            __m256i
182
157k
            vscale = _mm256_broadcastd_epi32( xscale );
183
157k
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
315k
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
157k
            {
187
157k
              __m256i
188
157k
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
157k
              __m256i
190
157k
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
157k
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
157k
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
157k
            }
195
157k
          }
196
39.6k
        }
197
19.9k
      }
198
6.82k
    }
199
6.82k
  }
200
#else
201
  if( trSize >= 8 )
202
  {
203
    for( int k = 0; k < rows; k += 2 )
204
    {
205
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
      {
212
        __m128i xscale = maxLoopL == 4
213
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
        for( int l = 0; l < maxLoopL; l++ )
220
        {
221
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
          __m128i
225
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
          {
230
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
          }
255
        }
256
      }
257
    }
258
  }
259
#endif
260
0
  else if( trSize >= 4 )
261
0
  {
262
0
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
0
    for( int k = 0; k < rows; k += 2 )
265
0
    {
266
0
            TCoeff* dstPtr  =  dst;
267
268
0
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
0
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
0
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
0
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
0
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
0
      {
278
0
        __m128i xscale = maxLoopL == 4
279
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
0
        for( int l = 0; l < maxLoopL; l++ )
286
0
        {
287
0
          __m128i
288
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
0
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
0
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
0
          {
293
0
            __m128i
294
0
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
0
            __m128i 
296
0
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
0
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
0
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
0
          }
301
0
        }
302
0
      }
303
0
    }
304
0
  }
305
0
  else
306
0
  {
307
0
    THROW_FATAL( "Unsupported size" );
308
0
  }
309
6.82k
#if USE_AVX2
310
311
6.82k
  _mm256_zeroupper();
312
6.82k
#endif
313
6.82k
}
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 16>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
67
9.32k
{
68
9.32k
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
9.32k
#if USE_AVX2
71
9.32k
  if( trSize >= 8 && vext >= AVX2 )
72
9.32k
  {
73
9.32k
    if( ( trSize & 15 ) == 0 )
74
9.32k
    {
75
9.32k
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
38.7k
      for( int k = 0; k < rows; k += 2 )
78
29.4k
      {
79
29.4k
              TCoeff* dstPtr =  dst;
80
81
29.4k
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
29.4k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
29.4k
        __m256i vsrc1v[trLoops][2];
85
        
86
29.4k
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
29.4k
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
58.8k
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
29.4k
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
29.4k
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
29.4k
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
29.4k
#endif
98
99
29.4k
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
29.4k
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
29.4k
        }
102
103
113k
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
83.7k
        {
105
83.7k
          __m128i xscale = maxLoopL == 4
106
83.7k
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
83.7k
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
83.7k
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
83.7k
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
342k
          for( int l = 0; l < maxLoopL; l++ )
113
273k
          {
114
273k
            __m256i
115
273k
            vscale = _mm256_broadcastd_epi32( xscale );
116
273k
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
547k
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
273k
            {
120
273k
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
273k
              __m256i
123
273k
              vsrc1 = vsrc1v[col][0];
124
273k
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
273k
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
273k
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
273k
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
273k
              vsrc1 = vsrc1v[col][1];
132
273k
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
273k
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
273k
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
273k
            }
137
273k
          }
138
68.7k
        }
139
29.4k
      }
140
9.32k
    }
141
0
    else
142
0
    {
143
0
      for( int k = 0; k < rows; k += 2 )
144
0
      {
145
0
              TCoeff* dstPtr  =  dst;
146
147
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
0
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
0
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
0
        __m256i vit;
154
155
0
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
0
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
0
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
0
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
0
#endif
166
167
0
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
0
        }
169
        
170
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
0
        {
172
0
          __m128i xscale = maxLoopL == 4
173
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
0
          for( int l = 0; l < maxLoopL; l++ )
180
0
          {
181
0
            __m256i
182
0
            vscale = _mm256_broadcastd_epi32( xscale );
183
0
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
0
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
0
            {
187
0
              __m256i
188
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
0
              __m256i
190
0
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
0
            }
195
0
          }
196
0
        }
197
0
      }
198
0
    }
199
9.32k
  }
200
#else
201
  if( trSize >= 8 )
202
  {
203
    for( int k = 0; k < rows; k += 2 )
204
    {
205
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
      {
212
        __m128i xscale = maxLoopL == 4
213
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
        for( int l = 0; l < maxLoopL; l++ )
220
        {
221
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
          __m128i
225
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
          {
230
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
          }
255
        }
256
      }
257
    }
258
  }
259
#endif
260
0
  else if( trSize >= 4 )
261
0
  {
262
0
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
0
    for( int k = 0; k < rows; k += 2 )
265
0
    {
266
0
            TCoeff* dstPtr  =  dst;
267
268
0
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
0
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
0
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
0
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
0
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
0
      {
278
0
        __m128i xscale = maxLoopL == 4
279
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
0
        for( int l = 0; l < maxLoopL; l++ )
286
0
        {
287
0
          __m128i
288
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
0
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
0
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
0
          {
293
0
            __m128i
294
0
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
0
            __m128i 
296
0
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
0
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
0
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
0
          }
301
0
        }
302
0
      }
303
0
    }
304
0
  }
305
0
  else
306
0
  {
307
0
    THROW_FATAL( "Unsupported size" );
308
0
  }
309
9.32k
#if USE_AVX2
310
311
9.32k
  _mm256_zeroupper();
312
9.32k
#endif
313
9.32k
}
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 32>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
67
18.0k
{
68
18.0k
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
18.0k
#if USE_AVX2
71
18.0k
  if( trSize >= 8 && vext >= AVX2 )
72
18.0k
  {
73
18.0k
    if( ( trSize & 15 ) == 0 )
74
18.0k
    {
75
18.0k
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
85.6k
      for( int k = 0; k < rows; k += 2 )
78
67.6k
      {
79
67.6k
              TCoeff* dstPtr =  dst;
80
81
67.6k
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
67.6k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
67.6k
        __m256i vsrc1v[trLoops][2];
85
        
86
67.6k
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
67.6k
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
202k
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
135k
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
135k
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
135k
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
135k
#endif
98
99
135k
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
135k
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
135k
        }
102
103
318k
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
250k
        {
105
250k
          __m128i xscale = maxLoopL == 4
106
250k
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
250k
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
250k
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
250k
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
1.04M
          for( int l = 0; l < maxLoopL; l++ )
113
834k
          {
114
834k
            __m256i
115
834k
            vscale = _mm256_broadcastd_epi32( xscale );
116
834k
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
2.50M
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
1.66M
            {
120
1.66M
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
1.66M
              __m256i
123
1.66M
              vsrc1 = vsrc1v[col][0];
124
1.66M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
1.66M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
1.66M
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
1.66M
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
1.66M
              vsrc1 = vsrc1v[col][1];
132
1.66M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
1.66M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
1.66M
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
1.66M
            }
137
834k
          }
138
209k
        }
139
67.6k
      }
140
18.0k
    }
141
0
    else
142
0
    {
143
0
      for( int k = 0; k < rows; k += 2 )
144
0
      {
145
0
              TCoeff* dstPtr  =  dst;
146
147
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
0
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
0
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
0
        __m256i vit;
154
155
0
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
0
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
0
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
0
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
0
#endif
166
167
0
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
0
        }
169
        
170
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
0
        {
172
0
          __m128i xscale = maxLoopL == 4
173
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
0
          for( int l = 0; l < maxLoopL; l++ )
180
0
          {
181
0
            __m256i
182
0
            vscale = _mm256_broadcastd_epi32( xscale );
183
0
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
0
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
0
            {
187
0
              __m256i
188
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
0
              __m256i
190
0
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
0
            }
195
0
          }
196
0
        }
197
0
      }
198
0
    }
199
18.0k
  }
200
#else
201
  if( trSize >= 8 )
202
  {
203
    for( int k = 0; k < rows; k += 2 )
204
    {
205
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
      {
212
        __m128i xscale = maxLoopL == 4
213
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
        for( int l = 0; l < maxLoopL; l++ )
220
        {
221
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
          __m128i
225
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
          {
230
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
          }
255
        }
256
      }
257
    }
258
  }
259
#endif
260
0
  else if( trSize >= 4 )
261
0
  {
262
0
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
0
    for( int k = 0; k < rows; k += 2 )
265
0
    {
266
0
            TCoeff* dstPtr  =  dst;
267
268
0
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
0
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
0
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
0
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
0
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
0
      {
278
0
        __m128i xscale = maxLoopL == 4
279
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
0
        for( int l = 0; l < maxLoopL; l++ )
286
0
        {
287
0
          __m128i
288
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
0
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
0
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
0
          {
293
0
            __m128i
294
0
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
0
            __m128i 
296
0
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
0
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
0
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
0
          }
301
0
        }
302
0
      }
303
0
    }
304
0
  }
305
0
  else
306
0
  {
307
0
    THROW_FATAL( "Unsupported size" );
308
0
  }
309
18.0k
#if USE_AVX2
310
311
18.0k
  _mm256_zeroupper();
312
18.0k
#endif
313
18.0k
}
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 64>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
67
2.58k
{
68
2.58k
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
2.58k
#if USE_AVX2
71
2.58k
  if( trSize >= 8 && vext >= AVX2 )
72
2.58k
  {
73
2.58k
    if( ( trSize & 15 ) == 0 )
74
2.58k
    {
75
2.58k
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
11.7k
      for( int k = 0; k < rows; k += 2 )
78
9.17k
      {
79
9.17k
              TCoeff* dstPtr =  dst;
80
81
9.17k
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
9.17k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
9.17k
        __m256i vsrc1v[trLoops][2];
85
        
86
9.17k
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
9.17k
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
45.8k
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
36.6k
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
36.6k
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
36.6k
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
36.6k
#endif
98
99
36.6k
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
36.6k
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
36.6k
        }
102
103
92.5k
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
83.4k
        {
105
83.4k
          __m128i xscale = maxLoopL == 4
106
83.4k
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
83.4k
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
83.4k
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
83.4k
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
357k
          for( int l = 0; l < maxLoopL; l++ )
113
285k
          {
114
285k
            __m256i
115
285k
            vscale = _mm256_broadcastd_epi32( xscale );
116
285k
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
1.42M
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
1.14M
            {
120
1.14M
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
1.14M
              __m256i
123
1.14M
              vsrc1 = vsrc1v[col][0];
124
1.14M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
1.14M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
1.14M
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
1.14M
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
1.14M
              vsrc1 = vsrc1v[col][1];
132
1.14M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
1.14M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
1.14M
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
1.14M
            }
137
285k
          }
138
71.5k
        }
139
9.17k
      }
140
2.58k
    }
141
0
    else
142
0
    {
143
0
      for( int k = 0; k < rows; k += 2 )
144
0
      {
145
0
              TCoeff* dstPtr  =  dst;
146
147
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
0
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
0
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
0
        __m256i vit;
154
155
0
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
0
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
0
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
0
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
0
#endif
166
167
0
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
0
        }
169
        
170
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
0
        {
172
0
          __m128i xscale = maxLoopL == 4
173
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
0
          for( int l = 0; l < maxLoopL; l++ )
180
0
          {
181
0
            __m256i
182
0
            vscale = _mm256_broadcastd_epi32( xscale );
183
0
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
0
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
0
            {
187
0
              __m256i
188
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
0
              __m256i
190
0
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
0
            }
195
0
          }
196
0
        }
197
0
      }
198
0
    }
199
2.58k
  }
200
#else
201
  if( trSize >= 8 )
202
  {
203
    for( int k = 0; k < rows; k += 2 )
204
    {
205
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
      {
212
        __m128i xscale = maxLoopL == 4
213
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
        for( int l = 0; l < maxLoopL; l++ )
220
        {
221
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
          __m128i
225
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
          {
230
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
          }
255
        }
256
      }
257
    }
258
  }
259
#endif
260
0
  else if( trSize >= 4 )
261
0
  {
262
0
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
0
    for( int k = 0; k < rows; k += 2 )
265
0
    {
266
0
            TCoeff* dstPtr  =  dst;
267
268
0
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
0
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
0
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
0
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
0
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
0
      {
278
0
        __m128i xscale = maxLoopL == 4
279
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
0
        for( int l = 0; l < maxLoopL; l++ )
286
0
        {
287
0
          __m128i
288
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
0
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
0
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
0
          {
293
0
            __m128i
294
0
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
0
            __m128i 
296
0
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
0
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
0
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
0
          }
301
0
        }
302
0
      }
303
0
    }
304
0
  }
305
0
  else
306
0
  {
307
0
    THROW_FATAL( "Unsupported size" );
308
0
  }
309
2.58k
#if USE_AVX2
310
311
2.58k
  _mm256_zeroupper();
312
2.58k
#endif
313
2.58k
}
314
315
template< X86_VEXT vext, int W >
316
void roundClip_SSE( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
317
19.9k
{
318
#if USE_AVX2
319
19.9k
  if( W >= 8 && vext >= AVX2 )
320
18.0k
  {
321
18.0k
    __m256i vmin = _mm256_set1_epi32( outputMin );
322
18.0k
    __m256i vmax = _mm256_set1_epi32( outputMax );
323
18.0k
    __m256i vrnd = _mm256_set1_epi32( round );
324
325
127k
    while( height-- )
326
109k
    {
327
468k
      for( int col = 0; col < width; col += 8 )
328
359k
      {
329
359k
        __m256i
330
359k
        vdst = _mm256_load_si256( ( __m256i * ) &dst[col] );
331
359k
        vdst = _mm256_add_epi32 ( vdst, vrnd );
332
359k
        vdst = _mm256_srai_epi32( vdst, shift );
333
359k
        vdst = _mm256_max_epi32 ( vdst, vmin );
334
359k
        vdst = _mm256_min_epi32 ( vdst, vmax );
335
359k
        _mm256_store_si256      ( ( __m256i * ) &dst[col], vdst );
336
359k
      }
337
338
109k
      dst += stride;
339
109k
    }
340
18.0k
  }
341
1.94k
  else
342
1.94k
#endif
343
1.94k
  if( W >= 4 )
344
1.94k
  {
345
1.94k
    __m128i vmin = _mm_set1_epi32( outputMin );
346
1.94k
    __m128i vmax = _mm_set1_epi32( outputMax );
347
1.94k
    __m128i vrnd = _mm_set1_epi32( round );
348
349
13.2k
    while( height-- )
350
11.3k
    {
351
22.6k
      for( int col = 0; col < width; col += 4 )
352
11.3k
      {
353
11.3k
        __m128i
354
11.3k
        vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] );
355
11.3k
        vdst = _mm_add_epi32  ( vdst, vrnd );
356
11.3k
        vdst = _mm_srai_epi32 ( vdst, shift );
357
11.3k
        vdst = _mm_max_epi32  ( vdst, vmin );
358
11.3k
        vdst = _mm_min_epi32  ( vdst, vmax );
359
11.3k
        _mm_store_si128       ( ( __m128i * ) &dst[col], vdst );
360
11.3k
      }
361
362
11.3k
      dst += stride;
363
11.3k
    }
364
1.94k
  }
365
0
  else
366
0
  {
367
0
    THROW_FATAL( "Unsupported size" );
368
0
  }
369
#if USE_AVX2
370
371
19.9k
  _mm256_zeroupper();
372
19.9k
#endif
373
19.9k
}
Unexecuted instantiation: void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
317
1.94k
{
318
1.94k
#if USE_AVX2
319
1.94k
  if( W >= 8 && vext >= AVX2 )
320
0
  {
321
0
    __m256i vmin = _mm256_set1_epi32( outputMin );
322
0
    __m256i vmax = _mm256_set1_epi32( outputMax );
323
0
    __m256i vrnd = _mm256_set1_epi32( round );
324
325
0
    while( height-- )
326
0
    {
327
0
      for( int col = 0; col < width; col += 8 )
328
0
      {
329
0
        __m256i
330
0
        vdst = _mm256_load_si256( ( __m256i * ) &dst[col] );
331
0
        vdst = _mm256_add_epi32 ( vdst, vrnd );
332
0
        vdst = _mm256_srai_epi32( vdst, shift );
333
0
        vdst = _mm256_max_epi32 ( vdst, vmin );
334
0
        vdst = _mm256_min_epi32 ( vdst, vmax );
335
0
        _mm256_store_si256      ( ( __m256i * ) &dst[col], vdst );
336
0
      }
337
338
0
      dst += stride;
339
0
    }
340
0
  }
341
1.94k
  else
342
1.94k
#endif
343
1.94k
  if( W >= 4 )
344
1.94k
  {
345
1.94k
    __m128i vmin = _mm_set1_epi32( outputMin );
346
1.94k
    __m128i vmax = _mm_set1_epi32( outputMax );
347
1.94k
    __m128i vrnd = _mm_set1_epi32( round );
348
349
13.2k
    while( height-- )
350
11.3k
    {
351
22.6k
      for( int col = 0; col < width; col += 4 )
352
11.3k
      {
353
11.3k
        __m128i
354
11.3k
        vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] );
355
11.3k
        vdst = _mm_add_epi32  ( vdst, vrnd );
356
11.3k
        vdst = _mm_srai_epi32 ( vdst, shift );
357
11.3k
        vdst = _mm_max_epi32  ( vdst, vmin );
358
11.3k
        vdst = _mm_min_epi32  ( vdst, vmax );
359
11.3k
        _mm_store_si128       ( ( __m128i * ) &dst[col], vdst );
360
11.3k
      }
361
362
11.3k
      dst += stride;
363
11.3k
    }
364
1.94k
  }
365
0
  else
366
0
  {
367
0
    THROW_FATAL( "Unsupported size" );
368
0
  }
369
1.94k
#if USE_AVX2
370
371
1.94k
  _mm256_zeroupper();
372
1.94k
#endif
373
1.94k
}
void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
317
18.0k
{
318
18.0k
#if USE_AVX2
319
18.0k
  if( W >= 8 && vext >= AVX2 )
320
18.0k
  {
321
18.0k
    __m256i vmin = _mm256_set1_epi32( outputMin );
322
18.0k
    __m256i vmax = _mm256_set1_epi32( outputMax );
323
18.0k
    __m256i vrnd = _mm256_set1_epi32( round );
324
325
127k
    while( height-- )
326
109k
    {
327
468k
      for( int col = 0; col < width; col += 8 )
328
359k
      {
329
359k
        __m256i
330
359k
        vdst = _mm256_load_si256( ( __m256i * ) &dst[col] );
331
359k
        vdst = _mm256_add_epi32 ( vdst, vrnd );
332
359k
        vdst = _mm256_srai_epi32( vdst, shift );
333
359k
        vdst = _mm256_max_epi32 ( vdst, vmin );
334
359k
        vdst = _mm256_min_epi32 ( vdst, vmax );
335
359k
        _mm256_store_si256      ( ( __m256i * ) &dst[col], vdst );
336
359k
      }
337
338
109k
      dst += stride;
339
109k
    }
340
18.0k
  }
341
0
  else
342
0
#endif
343
0
  if( W >= 4 )
344
0
  {
345
0
    __m128i vmin = _mm_set1_epi32( outputMin );
346
0
    __m128i vmax = _mm_set1_epi32( outputMax );
347
0
    __m128i vrnd = _mm_set1_epi32( round );
348
349
0
    while( height-- )
350
0
    {
351
0
      for( int col = 0; col < width; col += 4 )
352
0
      {
353
0
        __m128i
354
0
        vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] );
355
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
356
0
        vdst = _mm_srai_epi32 ( vdst, shift );
357
0
        vdst = _mm_max_epi32  ( vdst, vmin );
358
0
        vdst = _mm_min_epi32  ( vdst, vmax );
359
0
        _mm_store_si128       ( ( __m128i * ) &dst[col], vdst );
360
0
      }
361
362
0
      dst += stride;
363
0
    }
364
0
  }
365
0
  else
366
0
  {
367
0
    THROW_FATAL( "Unsupported size" );
368
0
  }
369
18.0k
#if USE_AVX2
370
371
18.0k
  _mm256_zeroupper();
372
18.0k
#endif
373
18.0k
}
374
375
template< X86_VEXT vext, int W >
376
void cpyResiClip_SSE( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
377
20.2k
{
378
#if USE_AVX2
379
20.2k
  if( W >= 16 )
380
15.5k
  {
381
15.5k
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
15.5k
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
15.5k
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
414k
    while( height-- )
386
399k
    {
387
1.27M
      for( int col = 0; col < width; col += 16 )
388
877k
      {
389
877k
        __m256i
390
877k
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
877k
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
877k
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
877k
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
877k
        __m256i                
396
877k
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
877k
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
877k
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
877k
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
877k
        __m256i
402
877k
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
877k
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
877k
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
877k
      }
406
407
399k
      src += width;
408
399k
      dst += stride;
409
399k
    }
410
15.5k
  }
411
4.71k
  else
412
4.71k
#endif
413
4.71k
  if( W >= 8 )
414
3.21k
  {
415
3.21k
    __m128i vmin = _mm_set1_epi32( outputMin );
416
3.21k
    __m128i vmax = _mm_set1_epi32( outputMax );
417
3.21k
    __m128i vrnd = _mm_set1_epi32( round );
418
419
46.8k
    while( height-- )
420
43.6k
    {
421
87.3k
      for( int col = 0; col < width; col += 8 )
422
43.6k
      {
423
43.6k
        __m128i
424
43.6k
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
43.6k
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
43.6k
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
43.6k
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
43.6k
        __m128i                
429
43.6k
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
43.6k
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
43.6k
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
43.6k
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
43.6k
        __m128i
434
43.6k
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
43.6k
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
43.6k
      }
437
438
43.6k
      src += width;
439
43.6k
      dst += stride;
440
43.6k
    }
441
3.21k
  }
442
1.49k
  else if( W >= 4 )
443
1.49k
  {
444
1.49k
    __m128i vmin = _mm_set1_epi32( outputMin );
445
1.49k
    __m128i vmax = _mm_set1_epi32( outputMax );
446
1.49k
    __m128i vrnd = _mm_set1_epi32( round );
447
448
1.49k
    __m128i vzero = _mm_setzero_si128();
449
1.49k
    __m128i vdst;
450
451
19.8k
    while( height-- )
452
18.3k
    {
453
36.7k
      for( int col = 0; col < width; col += 4 )
454
18.3k
      {
455
18.3k
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
18.3k
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
18.3k
        vdst = _mm_srai_epi32 ( vdst, shift );
458
18.3k
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
18.3k
        vdst = _mm_packs_epi32( vdst, vzero );
460
18.3k
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
18.3k
      }
462
463
18.3k
      src += width;
464
18.3k
      dst += stride;
465
18.3k
    }
466
1.49k
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
20.2k
}
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 16>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 32>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 64>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
377
1.49k
{
378
1.49k
#if USE_AVX2
379
1.49k
  if( W >= 16 )
380
0
  {
381
0
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
0
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
0
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
0
    while( height-- )
386
0
    {
387
0
      for( int col = 0; col < width; col += 16 )
388
0
      {
389
0
        __m256i
390
0
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
0
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
0
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
0
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
0
        __m256i                
396
0
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
0
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
0
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
0
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
0
        __m256i
402
0
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
0
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
0
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
0
      }
406
407
0
      src += width;
408
0
      dst += stride;
409
0
    }
410
0
  }
411
1.49k
  else
412
1.49k
#endif
413
1.49k
  if( W >= 8 )
414
0
  {
415
0
    __m128i vmin = _mm_set1_epi32( outputMin );
416
0
    __m128i vmax = _mm_set1_epi32( outputMax );
417
0
    __m128i vrnd = _mm_set1_epi32( round );
418
419
0
    while( height-- )
420
0
    {
421
0
      for( int col = 0; col < width; col += 8 )
422
0
      {
423
0
        __m128i
424
0
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
0
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
0
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
0
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
0
        __m128i                
429
0
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
0
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
0
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
0
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
0
        __m128i
434
0
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
0
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
0
      }
437
438
0
      src += width;
439
0
      dst += stride;
440
0
    }
441
0
  }
442
1.49k
  else if( W >= 4 )
443
1.49k
  {
444
1.49k
    __m128i vmin = _mm_set1_epi32( outputMin );
445
1.49k
    __m128i vmax = _mm_set1_epi32( outputMax );
446
1.49k
    __m128i vrnd = _mm_set1_epi32( round );
447
448
1.49k
    __m128i vzero = _mm_setzero_si128();
449
1.49k
    __m128i vdst;
450
451
19.8k
    while( height-- )
452
18.3k
    {
453
36.7k
      for( int col = 0; col < width; col += 4 )
454
18.3k
      {
455
18.3k
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
18.3k
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
18.3k
        vdst = _mm_srai_epi32 ( vdst, shift );
458
18.3k
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
18.3k
        vdst = _mm_packs_epi32( vdst, vzero );
460
18.3k
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
18.3k
      }
462
463
18.3k
      src += width;
464
18.3k
      dst += stride;
465
18.3k
    }
466
1.49k
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
1.49k
}
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
377
3.21k
{
378
3.21k
#if USE_AVX2
379
3.21k
  if( W >= 16 )
380
0
  {
381
0
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
0
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
0
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
0
    while( height-- )
386
0
    {
387
0
      for( int col = 0; col < width; col += 16 )
388
0
      {
389
0
        __m256i
390
0
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
0
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
0
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
0
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
0
        __m256i                
396
0
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
0
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
0
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
0
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
0
        __m256i
402
0
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
0
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
0
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
0
      }
406
407
0
      src += width;
408
0
      dst += stride;
409
0
    }
410
0
  }
411
3.21k
  else
412
3.21k
#endif
413
3.21k
  if( W >= 8 )
414
3.21k
  {
415
3.21k
    __m128i vmin = _mm_set1_epi32( outputMin );
416
3.21k
    __m128i vmax = _mm_set1_epi32( outputMax );
417
3.21k
    __m128i vrnd = _mm_set1_epi32( round );
418
419
46.8k
    while( height-- )
420
43.6k
    {
421
87.3k
      for( int col = 0; col < width; col += 8 )
422
43.6k
      {
423
43.6k
        __m128i
424
43.6k
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
43.6k
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
43.6k
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
43.6k
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
43.6k
        __m128i                
429
43.6k
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
43.6k
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
43.6k
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
43.6k
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
43.6k
        __m128i
434
43.6k
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
43.6k
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
43.6k
      }
437
438
43.6k
      src += width;
439
43.6k
      dst += stride;
440
43.6k
    }
441
3.21k
  }
442
0
  else if( W >= 4 )
443
0
  {
444
0
    __m128i vmin = _mm_set1_epi32( outputMin );
445
0
    __m128i vmax = _mm_set1_epi32( outputMax );
446
0
    __m128i vrnd = _mm_set1_epi32( round );
447
448
0
    __m128i vzero = _mm_setzero_si128();
449
0
    __m128i vdst;
450
451
0
    while( height-- )
452
0
    {
453
0
      for( int col = 0; col < width; col += 4 )
454
0
      {
455
0
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
0
        vdst = _mm_srai_epi32 ( vdst, shift );
458
0
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
0
        vdst = _mm_packs_epi32( vdst, vzero );
460
0
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
0
      }
462
463
0
      src += width;
464
0
      dst += stride;
465
0
    }
466
0
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
3.21k
}
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 16>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
377
4.74k
{
378
4.74k
#if USE_AVX2
379
4.74k
  if( W >= 16 )
380
4.74k
  {
381
4.74k
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
4.74k
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
4.74k
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
80.9k
    while( height-- )
386
76.2k
    {
387
152k
      for( int col = 0; col < width; col += 16 )
388
76.2k
      {
389
76.2k
        __m256i
390
76.2k
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
76.2k
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
76.2k
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
76.2k
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
76.2k
        __m256i                
396
76.2k
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
76.2k
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
76.2k
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
76.2k
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
76.2k
        __m256i
402
76.2k
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
76.2k
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
76.2k
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
76.2k
      }
406
407
76.2k
      src += width;
408
76.2k
      dst += stride;
409
76.2k
    }
410
4.74k
  }
411
0
  else
412
0
#endif
413
0
  if( W >= 8 )
414
0
  {
415
0
    __m128i vmin = _mm_set1_epi32( outputMin );
416
0
    __m128i vmax = _mm_set1_epi32( outputMax );
417
0
    __m128i vrnd = _mm_set1_epi32( round );
418
419
0
    while( height-- )
420
0
    {
421
0
      for( int col = 0; col < width; col += 8 )
422
0
      {
423
0
        __m128i
424
0
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
0
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
0
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
0
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
0
        __m128i                
429
0
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
0
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
0
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
0
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
0
        __m128i
434
0
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
0
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
0
      }
437
438
0
      src += width;
439
0
      dst += stride;
440
0
    }
441
0
  }
442
0
  else if( W >= 4 )
443
0
  {
444
0
    __m128i vmin = _mm_set1_epi32( outputMin );
445
0
    __m128i vmax = _mm_set1_epi32( outputMax );
446
0
    __m128i vrnd = _mm_set1_epi32( round );
447
448
0
    __m128i vzero = _mm_setzero_si128();
449
0
    __m128i vdst;
450
451
0
    while( height-- )
452
0
    {
453
0
      for( int col = 0; col < width; col += 4 )
454
0
      {
455
0
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
0
        vdst = _mm_srai_epi32 ( vdst, shift );
458
0
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
0
        vdst = _mm_packs_epi32( vdst, vzero );
460
0
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
0
      }
462
463
0
      src += width;
464
0
      dst += stride;
465
0
    }
466
0
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
4.74k
}
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 32>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
377
9.45k
{
378
9.45k
#if USE_AVX2
379
9.45k
  if( W >= 16 )
380
9.45k
  {
381
9.45k
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
9.45k
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
9.45k
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
254k
    while( height-- )
386
245k
    {
387
735k
      for( int col = 0; col < width; col += 16 )
388
490k
      {
389
490k
        __m256i
390
490k
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
490k
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
490k
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
490k
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
490k
        __m256i                
396
490k
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
490k
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
490k
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
490k
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
490k
        __m256i
402
490k
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
490k
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
490k
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
490k
      }
406
407
245k
      src += width;
408
245k
      dst += stride;
409
245k
    }
410
9.45k
  }
411
0
  else
412
0
#endif
413
0
  if( W >= 8 )
414
0
  {
415
0
    __m128i vmin = _mm_set1_epi32( outputMin );
416
0
    __m128i vmax = _mm_set1_epi32( outputMax );
417
0
    __m128i vrnd = _mm_set1_epi32( round );
418
419
0
    while( height-- )
420
0
    {
421
0
      for( int col = 0; col < width; col += 8 )
422
0
      {
423
0
        __m128i
424
0
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
0
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
0
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
0
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
0
        __m128i                
429
0
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
0
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
0
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
0
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
0
        __m128i
434
0
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
0
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
0
      }
437
438
0
      src += width;
439
0
      dst += stride;
440
0
    }
441
0
  }
442
0
  else if( W >= 4 )
443
0
  {
444
0
    __m128i vmin = _mm_set1_epi32( outputMin );
445
0
    __m128i vmax = _mm_set1_epi32( outputMax );
446
0
    __m128i vrnd = _mm_set1_epi32( round );
447
448
0
    __m128i vzero = _mm_setzero_si128();
449
0
    __m128i vdst;
450
451
0
    while( height-- )
452
0
    {
453
0
      for( int col = 0; col < width; col += 4 )
454
0
      {
455
0
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
0
        vdst = _mm_srai_epi32 ( vdst, shift );
458
0
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
0
        vdst = _mm_packs_epi32( vdst, vzero );
460
0
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
0
      }
462
463
0
      src += width;
464
0
      dst += stride;
465
0
    }
466
0
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
9.45k
}
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 64>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
377
1.31k
{
378
1.31k
#if USE_AVX2
379
1.31k
  if( W >= 16 )
380
1.31k
  {
381
1.31k
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
1.31k
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
1.31k
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
78.9k
    while( height-- )
386
77.6k
    {
387
388k
      for( int col = 0; col < width; col += 16 )
388
310k
      {
389
310k
        __m256i
390
310k
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
310k
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
310k
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
310k
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
310k
        __m256i                
396
310k
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
310k
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
310k
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
310k
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
310k
        __m256i
402
310k
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
310k
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
310k
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
310k
      }
406
407
77.6k
      src += width;
408
77.6k
      dst += stride;
409
77.6k
    }
410
1.31k
  }
411
0
  else
412
0
#endif
413
0
  if( W >= 8 )
414
0
  {
415
0
    __m128i vmin = _mm_set1_epi32( outputMin );
416
0
    __m128i vmax = _mm_set1_epi32( outputMax );
417
0
    __m128i vrnd = _mm_set1_epi32( round );
418
419
0
    while( height-- )
420
0
    {
421
0
      for( int col = 0; col < width; col += 8 )
422
0
      {
423
0
        __m128i
424
0
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
0
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
0
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
0
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
0
        __m128i                
429
0
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
0
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
0
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
0
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
0
        __m128i
434
0
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
0
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
0
      }
437
438
0
      src += width;
439
0
      dst += stride;
440
0
    }
441
0
  }
442
0
  else if( W >= 4 )
443
0
  {
444
0
    __m128i vmin = _mm_set1_epi32( outputMin );
445
0
    __m128i vmax = _mm_set1_epi32( outputMax );
446
0
    __m128i vrnd = _mm_set1_epi32( round );
447
448
0
    __m128i vzero = _mm_setzero_si128();
449
0
    __m128i vdst;
450
451
0
    while( height-- )
452
0
    {
453
0
      for( int col = 0; col < width; col += 4 )
454
0
      {
455
0
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
0
        vdst = _mm_srai_epi32 ( vdst, shift );
458
0
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
0
        vdst = _mm_packs_epi32( vdst, vzero );
460
0
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
0
      }
462
463
0
      src += width;
464
0
      dst += stride;
465
0
    }
466
0
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
1.31k
}
472
473
// Matrix-vector product kernel using 128-bit (__m128i) integer intrinsics.
//
// Produces trSize outputs (48 when size > 4, otherwise 16). Each output is the dot
// product of the first zeroOutSize ints of src with one row of the int8 matrix taken
// from g_lfnst8x8[mode][index] (size > 4) or g_lfnst4x4[mode][index]; consecutive rows
// are 16 int8 entries apart. The sum is rounded with ( x + 64 ) >> 7 and clamped to
// [ -( 1 << 15 ), ( 1 << 15 ) - 1 ].
//
//   src         - input coefficients; zeroOutSize ints are read (unaligned loads)
//   dst         - output buffer; trSize ints are written (unaligned stores)
//   mode, index - select the matrix; index > 2 is rejected by the CHECK below
//   size        - only compared against 4 to choose the table and the output count
//   zeroOutSize - number of inputs used per row; must be 8 or 16 (CHECK below)
//
// The template parameter vext is not referenced in the body.
// Note: in this coverage listing each source line is preceded by its line number and,
// where instrumented, its execution count.
template<X86_VEXT vext>
474
static void simdInvLfnstNxNCore( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize )
475
5.29k
{
476
5.29k
  CHECK( index > 2 || ( zeroOutSize != 8 && zeroOutSize != 16 ), "Wrong parameters" ); // condition describes the invalid case
477
478
5.29k
  static constexpr int maxLog2TrDynamicRange = 15;
479
5.29k
  const TCoeff    outputMinimum = -( 1 << maxLog2TrDynamicRange );
480
5.29k
  const TCoeff    outputMaximum =  ( 1 << maxLog2TrDynamicRange ) - 1;
481
5.29k
  const int8_t*   trMat         = ( size > 4 ) ? g_lfnst8x8[mode][index][0] : g_lfnst4x4[mode][index][0];
482
5.29k
  const int       trSize        = ( size > 4 ) ? 48 : 16;
483
5.29k
  int*            out           = dst;
484
485
5.29k
  const __m128i vzero = _mm_setzero_si128();
486
5.29k
  const __m128i vmin  = _mm_set1_epi32( outputMinimum );
487
5.29k
  const __m128i vmax  = _mm_set1_epi32( outputMaximum );
488
489
58.2k
  for( int j = 0; j < trSize; j += 4, out += 4 ) // four outputs per iteration
490
52.9k
  {
491
52.9k
    __m128i       vsum[4];
492
493
264k
    for( int k = 0; k < 4; k++, trMat += 16 ) // one matrix row per output; rows are 16 int8 apart
494
211k
    {
495
211k
      const int8_t* trMatTmp = trMat;
496
211k
      int* srcPtr = src;
497
498
211k
      __m128i vsrc;
499
211k
      __m128i vtr;
500
211k
      __m128i vtmp;
501
211k
      __m128i vcur = vzero;
502
503
609k
      for( int i = 0; i < zeroOutSize; i += 8, srcPtr += 8, trMatTmp += 8 ) // eight inputs per step
504
398k
      {
505
398k
        vsrc = _mm_loadu_si128( ( const __m128i* ) srcPtr );
506
398k
        vtr  = _mm_loadu_si64( ( const __m128i* ) trMatTmp ); // 8 int8 matrix entries
507
398k
        vtr  = _mm_cvtepi8_epi16( vtr ); // sign-extend to 8 x int16
508
398k
        vtmp = _mm_cvtepi16_epi32( vtr ); // entries 0..3 as int32
509
510
398k
        vtmp = _mm_mullo_epi32( vsrc, vtmp );
511
398k
        vcur = _mm_add_epi32( vtmp, vcur );
512
513
398k
        vsrc = _mm_loadu_si128( ( const __m128i* ) &srcPtr[4] );
514
398k
        vtmp = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtr, vzero ) ); // entries 4..7 as int32
515
      
516
398k
        vtmp = _mm_mullo_epi32( vsrc, vtmp );
517
398k
        vcur = _mm_add_epi32( vtmp, vcur );
518
398k
      }
519
520
211k
      vsum[k] = vcur; // four partial sums for row k, reduced below
521
211k
    }
522
523
52.9k
    __m128i vout = _mm_hadd_epi32( _mm_hadd_epi32( vsum[0], vsum[1] ), _mm_hadd_epi32( vsum[2], vsum[3] ) ); // lane k = total of vsum[k]
524
52.9k
    vout = _mm_add_epi32( vout, _mm_set1_epi32( 64 ) ); // rounding offset for the >> 7 below
525
52.9k
    vout = _mm_srai_epi32( vout, 7 );
526
52.9k
    vout = _mm_min_epi32( _mm_max_epi32( vmin, vout ), vmax ); // clamp to [outputMinimum, outputMaximum]
527
528
52.9k
    _mm_storeu_si128( ( __m128i* ) out, vout );
529
52.9k
  }
530
5.29k
}
Unexecuted instantiation: Trafo_sse41.cpp:void vvdec::simdInvLfnstNxNCore<(vvdec::x86_simd::X86_VEXT)1>(int*, int*, unsigned int, unsigned int, unsigned int, int)
Trafo_avx2.cpp:void vvdec::simdInvLfnstNxNCore<(vvdec::x86_simd::X86_VEXT)4>(int*, int*, unsigned int, unsigned int, unsigned int, int)
Line
Count
Source
475
5.29k
{
476
5.29k
  CHECK( index > 2 || ( zeroOutSize != 8 && zeroOutSize != 16 ), "Wrong parameters" );
477
478
5.29k
  static constexpr int maxLog2TrDynamicRange = 15;
479
5.29k
  const TCoeff    outputMinimum = -( 1 << maxLog2TrDynamicRange );
480
5.29k
  const TCoeff    outputMaximum =  ( 1 << maxLog2TrDynamicRange ) - 1;
481
5.29k
  const int8_t*   trMat         = ( size > 4 ) ? g_lfnst8x8[mode][index][0] : g_lfnst4x4[mode][index][0];
482
5.29k
  const int       trSize        = ( size > 4 ) ? 48 : 16;
483
5.29k
  int*            out           = dst;
484
485
5.29k
  const __m128i vzero = _mm_setzero_si128();
486
5.29k
  const __m128i vmin  = _mm_set1_epi32( outputMinimum );
487
5.29k
  const __m128i vmax  = _mm_set1_epi32( outputMaximum );
488
489
58.2k
  for( int j = 0; j < trSize; j += 4, out += 4 )
490
52.9k
  {
491
52.9k
    __m128i       vsum[4];
492
493
264k
    for( int k = 0; k < 4; k++, trMat += 16 )
494
211k
    {
495
211k
      const int8_t* trMatTmp = trMat;
496
211k
      int* srcPtr = src;
497
498
211k
      __m128i vsrc;
499
211k
      __m128i vtr;
500
211k
      __m128i vtmp;
501
211k
      __m128i vcur = vzero;
502
503
609k
      for( int i = 0; i < zeroOutSize; i += 8, srcPtr += 8, trMatTmp += 8 )
504
398k
      {
505
398k
        vsrc = _mm_loadu_si128( ( const __m128i* ) srcPtr );
506
398k
        vtr  = _mm_loadu_si64( ( const __m128i* ) trMatTmp );
507
398k
        vtr  = _mm_cvtepi8_epi16( vtr );
508
398k
        vtmp = _mm_cvtepi16_epi32( vtr );
509
510
398k
        vtmp = _mm_mullo_epi32( vsrc, vtmp );
511
398k
        vcur = _mm_add_epi32( vtmp, vcur );
512
513
398k
        vsrc = _mm_loadu_si128( ( const __m128i* ) &srcPtr[4] );
514
398k
        vtmp = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtr, vzero ) );
515
      
516
398k
        vtmp = _mm_mullo_epi32( vsrc, vtmp );
517
398k
        vcur = _mm_add_epi32( vtmp, vcur );
518
398k
      }
519
520
211k
      vsum[k] = vcur;
521
211k
    }
522
523
52.9k
    __m128i vout = _mm_hadd_epi32( _mm_hadd_epi32( vsum[0], vsum[1] ), _mm_hadd_epi32( vsum[2], vsum[3] ) );
524
52.9k
    vout = _mm_add_epi32( vout, _mm_set1_epi32( 64 ) );
525
52.9k
    vout = _mm_srai_epi32( vout, 7 );
526
52.9k
    vout = _mm_min_epi32( _mm_max_epi32( vmin, vout ), vmax );
527
528
52.9k
    _mm_storeu_si128( ( __m128i* ) out, vout );
529
52.9k
  }
530
5.29k
}
531
532
// Installs the x86 SIMD kernels, instantiated for the given vext, into the TCoeffOps
// dispatch members:
//   cpyResiClip[2..6] <- cpyResiClip_SSE with W = 4, 8, 16, 32, 64
//   roundClip4/8      <- roundClip_SSE   with W = 4, 8
//   fastInvCore[0..4] <- fastInv_SSE     with W = 4, 8, 16, 32, 64
// cpyResiClip slots 0 and 1 are not assigned here.
// NOTE(review): the cpyResiClip slot index appears to be log2 of the width - confirm
// against the callers.
template<X86_VEXT vext>
533
void TCoeffOps::_initTCoeffOpsX86()
534
2.42k
{
535
2.42k
  cpyResiClip[2] = cpyResiClip_SSE<vext,  4>;
536
2.42k
  cpyResiClip[3] = cpyResiClip_SSE<vext,  8>;
537
2.42k
  cpyResiClip[4] = cpyResiClip_SSE<vext, 16>;
538
2.42k
  cpyResiClip[5] = cpyResiClip_SSE<vext, 32>;
539
2.42k
  cpyResiClip[6] = cpyResiClip_SSE<vext, 64>;
540
2.42k
  roundClip4     = roundClip_SSE<vext,  4>;
541
2.42k
  roundClip8     = roundClip_SSE<vext,  8>;
542
2.42k
  fastInvCore[0] = fastInv_SSE  <vext,  4>;
543
2.42k
  fastInvCore[1] = fastInv_SSE  <vext,  8>;
544
2.42k
  fastInvCore[2] = fastInv_SSE  <vext, 16>;
545
2.42k
  fastInvCore[3] = fastInv_SSE  <vext, 32>;
546
2.42k
  fastInvCore[4] = fastInv_SSE  <vext, 64>;
547
2.42k
}
Unexecuted instantiation: void vvdec::TCoeffOps::_initTCoeffOpsX86<(vvdec::x86_simd::X86_VEXT)1>()
void vvdec::TCoeffOps::_initTCoeffOpsX86<(vvdec::x86_simd::X86_VEXT)4>()
Line
Count
Source
534
2.42k
{
535
2.42k
  cpyResiClip[2] = cpyResiClip_SSE<vext,  4>;
536
2.42k
  cpyResiClip[3] = cpyResiClip_SSE<vext,  8>;
537
2.42k
  cpyResiClip[4] = cpyResiClip_SSE<vext, 16>;
538
2.42k
  cpyResiClip[5] = cpyResiClip_SSE<vext, 32>;
539
2.42k
  cpyResiClip[6] = cpyResiClip_SSE<vext, 64>;
540
2.42k
  roundClip4     = roundClip_SSE<vext,  4>;
541
2.42k
  roundClip8     = roundClip_SSE<vext,  8>;
542
2.42k
  fastInvCore[0] = fastInv_SSE  <vext,  4>;
543
2.42k
  fastInvCore[1] = fastInv_SSE  <vext,  8>;
544
2.42k
  fastInvCore[2] = fastInv_SSE  <vext, 16>;
545
2.42k
  fastInvCore[3] = fastInv_SSE  <vext, 32>;
546
2.42k
  fastInvCore[4] = fastInv_SSE  <vext, 64>;
547
2.42k
}
548
549
// Points the TrQuant member m_invLfnstNxN at the SIMD kernel simdInvLfnstNxNCore,
// instantiated for the given vext.
template<X86_VEXT vext>
550
void TrQuant::_initTrQuantX86()
551
25.8k
{
552
25.8k
  m_invLfnstNxN = simdInvLfnstNxNCore<vext>;
553
25.8k
}
Unexecuted instantiation: void vvdec::TrQuant::_initTrQuantX86<(vvdec::x86_simd::X86_VEXT)1>()
void vvdec::TrQuant::_initTrQuantX86<(vvdec::x86_simd::X86_VEXT)4>()
Line
Count
Source
551
25.8k
{
552
25.8k
  m_invLfnstNxN = simdInvLfnstNxNCore<vext>;
553
25.8k
}
554
555
// Explicit instantiations of both init routines for SIMDX86 (defined outside this view;
// presumably the SIMD level of the including translation unit - TODO confirm).
template void TCoeffOps::_initTCoeffOpsX86<SIMDX86>();
556
template void TrQuant::_initTrQuantX86<SIMDX86>();
557
558
#endif // TARGET_SIMD_X86
559
#endif
560
561
}