Coverage Report

Created: 2026-06-16 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/TrafoX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     TrafoX86.h
44
    \brief    SIMD trafo
45
*/
46
47
//! \ingroup CommonLib
48
//! \{
49
50
51
#include "CommonLib/CommonDef.h"
52
#include "CommonLib/Rom.h"
53
54
#include "CommonDefX86.h"
55
56
#include "TrQuant.h"
57
#include "TrQuant_EMT.h"
58
59
namespace vvdec
60
{
61
62
#if ENABLE_SIMD_TCOEFF_OPS
63
#ifdef TARGET_SIMD_X86
64
65
template< X86_VEXT vext, int trSize >
66
void fastInv_SSE( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines, unsigned reducedLines, unsigned rows )
67
111k
{
68
111k
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
#if USE_AVX2
71
111k
  if( trSize >= 8 && vext >= AVX2 )
72
107k
  {
73
107k
    if( ( trSize & 15 ) == 0 )
74
81.0k
    {
75
81.0k
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
368k
      for( int k = 0; k < rows; k += 2 )
78
287k
      {
79
287k
              TCoeff* dstPtr =  dst;
80
81
287k
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
287k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
287k
        __m256i vsrc1v[trLoops][2];
85
        
86
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
822k
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
534k
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
534k
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
534k
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
534k
#endif
98
99
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
        }
102
103
1.38M
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
1.09M
        {
105
1.09M
          __m128i xscale = maxLoopL == 4
106
1.09M
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
1.09M
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
1.09M
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
1.09M
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
4.57M
          for( int l = 0; l < maxLoopL; l++ )
113
3.65M
          {
114
3.65M
            __m256i
115
3.65M
            vscale = _mm256_broadcastd_epi32( xscale );
116
3.65M
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
11.7M
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
8.08M
            {
120
8.08M
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
8.08M
              __m256i
123
8.08M
              vsrc1 = vsrc1v[col][0];
124
8.08M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
8.08M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
8.08M
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
8.08M
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
8.08M
              vsrc1 = vsrc1v[col][1];
132
8.08M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
8.08M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
8.08M
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
8.08M
            }
137
3.65M
          }
138
917k
        }
139
287k
      }
140
81.0k
    }
141
26.9k
    else
142
26.9k
    {
143
106k
      for( int k = 0; k < rows; k += 2 )
144
79.1k
      {
145
79.1k
              TCoeff* dstPtr  =  dst;
146
147
79.1k
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
79.1k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
79.1k
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
79.1k
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
79.1k
        __m256i vit;
154
155
79.1k
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
79.1k
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
79.1k
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
79.1k
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
79.1k
#endif
166
167
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
        }
169
        
170
276k
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
196k
        {
172
196k
          __m128i xscale = maxLoopL == 4
173
196k
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
196k
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
196k
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
196k
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
808k
          for( int l = 0; l < maxLoopL; l++ )
180
645k
          {
181
645k
            __m256i
182
645k
            vscale = _mm256_broadcastd_epi32( xscale );
183
645k
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
1.29M
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
645k
            {
187
645k
              __m256i
188
645k
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
645k
              __m256i
190
645k
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
645k
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
645k
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
645k
            }
195
645k
          }
196
162k
        }
197
79.1k
      }
198
26.9k
    }
199
107k
  }
200
#else
201
0
  if( trSize >= 8 )
202
0
  {
203
0
    for( int k = 0; k < rows; k += 2 )
204
0
    {
205
0
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
0
      {
212
0
        __m128i xscale = maxLoopL == 4
213
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
0
        for( int l = 0; l < maxLoopL; l++ )
220
0
        {
221
0
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
0
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
0
          __m128i
225
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
0
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
0
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
0
          {
230
0
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
0
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
0
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
0
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
0
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
0
          }
255
0
        }
256
0
      }
257
0
    }
258
0
  }
259
0
#endif
260
3.49k
  else if( trSize >= 4 )
261
3.49k
  {
262
3.49k
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
9.62k
    for( int k = 0; k < rows; k += 2 )
265
6.12k
    {
266
6.12k
            TCoeff* dstPtr  =  dst;
267
268
6.12k
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
6.12k
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
6.12k
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
6.12k
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
6.12k
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
24.2k
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
18.0k
      {
278
18.0k
        __m128i xscale = maxLoopL == 4
279
18.0k
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
18.0k
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
18.0k
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
18.0k
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
72.3k
        for( int l = 0; l < maxLoopL; l++ )
286
57.5k
        {
287
57.5k
          __m128i
288
57.5k
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
57.5k
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
115k
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
57.5k
          {
293
57.5k
            __m128i
294
57.5k
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
57.5k
            __m128i 
296
57.5k
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
57.5k
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
57.5k
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
57.5k
          }
301
57.5k
        }
302
14.7k
      }
303
6.12k
    }
304
3.49k
  }
305
18.4E
  else
306
18.4E
  {
307
18.4E
    THROW_FATAL( "Unsupported size" );
308
18.4E
  }
309
#if USE_AVX2
310
311
111k
  _mm256_zeroupper();
312
111k
#endif
313
111k
}
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 16>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 32>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 64>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
67
3.49k
{
68
3.49k
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
3.49k
#if USE_AVX2
71
3.49k
  if( trSize >= 8 && vext >= AVX2 )
72
0
  {
73
0
    if( ( trSize & 15 ) == 0 )
74
0
    {
75
0
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
0
      for( int k = 0; k < rows; k += 2 )
78
0
      {
79
0
              TCoeff* dstPtr =  dst;
80
81
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
0
        __m256i vsrc1v[trLoops][2];
85
        
86
0
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
0
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
0
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
0
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
0
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
0
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
0
#endif
98
99
0
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
0
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
0
        }
102
103
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
0
        {
105
0
          __m128i xscale = maxLoopL == 4
106
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
0
          for( int l = 0; l < maxLoopL; l++ )
113
0
          {
114
0
            __m256i
115
0
            vscale = _mm256_broadcastd_epi32( xscale );
116
0
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
0
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
0
            {
120
0
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
0
              __m256i
123
0
              vsrc1 = vsrc1v[col][0];
124
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
0
              vsrc1 = vsrc1v[col][1];
132
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
0
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
0
            }
137
0
          }
138
0
        }
139
0
      }
140
0
    }
141
0
    else
142
0
    {
143
0
      for( int k = 0; k < rows; k += 2 )
144
0
      {
145
0
              TCoeff* dstPtr  =  dst;
146
147
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
0
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
0
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
0
        __m256i vit;
154
155
0
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
0
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
0
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
0
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
0
#endif
166
167
0
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
0
        }
169
        
170
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
0
        {
172
0
          __m128i xscale = maxLoopL == 4
173
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
0
          for( int l = 0; l < maxLoopL; l++ )
180
0
          {
181
0
            __m256i
182
0
            vscale = _mm256_broadcastd_epi32( xscale );
183
0
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
0
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
0
            {
187
0
              __m256i
188
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
0
              __m256i
190
0
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
0
            }
195
0
          }
196
0
        }
197
0
      }
198
0
    }
199
0
  }
200
#else
201
  if( trSize >= 8 )
202
  {
203
    for( int k = 0; k < rows; k += 2 )
204
    {
205
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
      {
212
        __m128i xscale = maxLoopL == 4
213
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
        for( int l = 0; l < maxLoopL; l++ )
220
        {
221
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
          __m128i
225
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
          {
230
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
          }
255
        }
256
      }
257
    }
258
  }
259
#endif
260
3.49k
  else if( trSize >= 4 )
261
3.49k
  {
262
3.49k
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
9.62k
    for( int k = 0; k < rows; k += 2 )
265
6.12k
    {
266
6.12k
            TCoeff* dstPtr  =  dst;
267
268
6.12k
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
6.12k
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
6.12k
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
6.12k
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
6.12k
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
24.2k
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
18.0k
      {
278
18.0k
        __m128i xscale = maxLoopL == 4
279
18.0k
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
18.0k
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
18.0k
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
18.0k
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
72.3k
        for( int l = 0; l < maxLoopL; l++ )
286
57.5k
        {
287
57.5k
          __m128i
288
57.5k
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
57.5k
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
115k
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
57.5k
          {
293
57.5k
            __m128i
294
57.5k
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
57.5k
            __m128i 
296
57.5k
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
57.5k
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
57.5k
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
57.5k
          }
301
57.5k
        }
302
14.7k
      }
303
6.12k
    }
304
3.49k
  }
305
0
  else
306
0
  {
307
0
    THROW_FATAL( "Unsupported size" );
308
0
  }
309
3.49k
#if USE_AVX2
310
311
3.49k
  _mm256_zeroupper();
312
3.49k
#endif
313
3.49k
}
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
67
26.9k
{
68
26.9k
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
26.9k
#if USE_AVX2
71
26.9k
  if( trSize >= 8 && vext >= AVX2 )
72
26.9k
  {
73
26.9k
    if( ( trSize & 15 ) == 0 )
74
0
    {
75
0
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
0
      for( int k = 0; k < rows; k += 2 )
78
0
      {
79
0
              TCoeff* dstPtr =  dst;
80
81
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
0
        __m256i vsrc1v[trLoops][2];
85
        
86
0
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
0
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
0
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
0
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
0
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
0
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
0
#endif
98
99
0
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
0
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
0
        }
102
103
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
0
        {
105
0
          __m128i xscale = maxLoopL == 4
106
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
0
          for( int l = 0; l < maxLoopL; l++ )
113
0
          {
114
0
            __m256i
115
0
            vscale = _mm256_broadcastd_epi32( xscale );
116
0
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
0
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
0
            {
120
0
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
0
              __m256i
123
0
              vsrc1 = vsrc1v[col][0];
124
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
0
              vsrc1 = vsrc1v[col][1];
132
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
0
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
0
            }
137
0
          }
138
0
        }
139
0
      }
140
0
    }
141
26.9k
    else
142
26.9k
    {
143
106k
      for( int k = 0; k < rows; k += 2 )
144
79.1k
      {
145
79.1k
              TCoeff* dstPtr  =  dst;
146
147
79.1k
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
79.1k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
79.1k
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
79.1k
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
79.1k
        __m256i vit;
154
155
79.1k
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
79.1k
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
79.1k
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
79.1k
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
79.1k
#endif
166
167
79.1k
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
79.1k
        }
169
        
170
276k
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
196k
        {
172
196k
          __m128i xscale = maxLoopL == 4
173
196k
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
196k
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
196k
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
196k
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
808k
          for( int l = 0; l < maxLoopL; l++ )
180
645k
          {
181
645k
            __m256i
182
645k
            vscale = _mm256_broadcastd_epi32( xscale );
183
645k
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
1.29M
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
645k
            {
187
645k
              __m256i
188
645k
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
645k
              __m256i
190
645k
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
645k
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
645k
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
645k
            }
195
645k
          }
196
162k
        }
197
79.1k
      }
198
26.9k
    }
199
26.9k
  }
200
#else
201
  if( trSize >= 8 )
202
  {
203
    for( int k = 0; k < rows; k += 2 )
204
    {
205
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
      {
212
        __m128i xscale = maxLoopL == 4
213
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
        for( int l = 0; l < maxLoopL; l++ )
220
        {
221
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
          __m128i
225
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
          {
230
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
          }
255
        }
256
      }
257
    }
258
  }
259
#endif
260
0
  else if( trSize >= 4 )
261
0
  {
262
0
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
0
    for( int k = 0; k < rows; k += 2 )
265
0
    {
266
0
            TCoeff* dstPtr  =  dst;
267
268
0
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
0
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
0
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
0
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
0
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
0
      {
278
0
        __m128i xscale = maxLoopL == 4
279
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
0
        for( int l = 0; l < maxLoopL; l++ )
286
0
        {
287
0
          __m128i
288
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
0
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
0
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
0
          {
293
0
            __m128i
294
0
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
0
            __m128i 
296
0
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
0
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
0
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
0
          }
301
0
        }
302
0
      }
303
0
    }
304
0
  }
305
0
  else
306
0
  {
307
0
    THROW_FATAL( "Unsupported size" );
308
0
  }
309
26.9k
#if USE_AVX2
310
311
26.9k
  _mm256_zeroupper();
312
26.9k
#endif
313
26.9k
}
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 16>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
67
31.4k
{
68
31.4k
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
31.4k
#if USE_AVX2
71
31.4k
  if( trSize >= 8 && vext >= AVX2 )
72
31.4k
  {
73
31.4k
    if( ( trSize & 15 ) == 0 )
74
31.4k
    {
75
31.4k
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
130k
      for( int k = 0; k < rows; k += 2 )
78
99.1k
      {
79
99.1k
              TCoeff* dstPtr =  dst;
80
81
99.1k
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
99.1k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
99.1k
        __m256i vsrc1v[trLoops][2];
85
        
86
99.1k
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
99.1k
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
198k
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
99.1k
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
99.1k
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
99.1k
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
99.1k
#endif
98
99
99.1k
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
99.1k
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
99.1k
        }
102
103
390k
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
291k
        {
105
291k
          __m128i xscale = maxLoopL == 4
106
291k
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
291k
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
291k
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
291k
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
1.20M
          for( int l = 0; l < maxLoopL; l++ )
113
960k
          {
114
960k
            __m256i
115
960k
            vscale = _mm256_broadcastd_epi32( xscale );
116
960k
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
1.92M
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
960k
            {
120
960k
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
960k
              __m256i
123
960k
              vsrc1 = vsrc1v[col][0];
124
960k
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
960k
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
960k
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
960k
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
960k
              vsrc1 = vsrc1v[col][1];
132
960k
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
960k
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
960k
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
960k
            }
137
960k
          }
138
241k
        }
139
99.1k
      }
140
31.4k
    }
141
18.4E
    else
142
18.4E
    {
143
18.4E
      for( int k = 0; k < rows; k += 2 )
144
0
      {
145
0
              TCoeff* dstPtr  =  dst;
146
147
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
0
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
0
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
0
        __m256i vit;
154
155
0
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
0
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
0
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
0
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
0
#endif
166
167
0
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
0
        }
169
        
170
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
0
        {
172
0
          __m128i xscale = maxLoopL == 4
173
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
0
          for( int l = 0; l < maxLoopL; l++ )
180
0
          {
181
0
            __m256i
182
0
            vscale = _mm256_broadcastd_epi32( xscale );
183
0
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
0
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
0
            {
187
0
              __m256i
188
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
0
              __m256i
190
0
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
0
            }
195
0
          }
196
0
        }
197
0
      }
198
18.4E
    }
199
31.4k
  }
200
#else
201
  if( trSize >= 8 )
202
  {
203
    for( int k = 0; k < rows; k += 2 )
204
    {
205
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
      {
212
        __m128i xscale = maxLoopL == 4
213
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
        for( int l = 0; l < maxLoopL; l++ )
220
        {
221
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
          __m128i
225
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
          {
230
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
          }
255
        }
256
      }
257
    }
258
  }
259
#endif
260
0
  else if( trSize >= 4 )
261
0
  {
262
0
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
0
    for( int k = 0; k < rows; k += 2 )
265
0
    {
266
0
            TCoeff* dstPtr  =  dst;
267
268
0
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
0
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
0
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
0
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
0
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
0
      {
278
0
        __m128i xscale = maxLoopL == 4
279
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
0
        for( int l = 0; l < maxLoopL; l++ )
286
0
        {
287
0
          __m128i
288
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
0
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
0
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
0
          {
293
0
            __m128i
294
0
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
0
            __m128i 
296
0
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
0
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
0
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
0
          }
301
0
        }
302
0
      }
303
0
    }
304
0
  }
305
0
  else
306
0
  {
307
0
    THROW_FATAL( "Unsupported size" );
308
0
  }
309
31.4k
#if USE_AVX2
310
311
31.4k
  _mm256_zeroupper();
312
31.4k
#endif
313
31.4k
}
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 32>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
67
41.7k
{
68
41.7k
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
41.7k
#if USE_AVX2
71
41.7k
  if( trSize >= 8 && vext >= AVX2 )
72
41.7k
  {
73
41.7k
    if( ( trSize & 15 ) == 0 )
74
41.7k
    {
75
41.7k
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
201k
      for( int k = 0; k < rows; k += 2 )
78
159k
      {
79
159k
              TCoeff* dstPtr =  dst;
80
81
159k
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
159k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
159k
        __m256i vsrc1v[trLoops][2];
85
        
86
159k
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
159k
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
478k
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
319k
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
319k
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
319k
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
319k
#endif
98
99
319k
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
319k
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
319k
        }
102
103
708k
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
548k
        {
105
548k
          __m128i xscale = maxLoopL == 4
106
548k
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
548k
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
548k
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
548k
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
2.28M
          for( int l = 0; l < maxLoopL; l++ )
113
1.82M
          {
114
1.82M
            __m256i
115
1.82M
            vscale = _mm256_broadcastd_epi32( xscale );
116
1.82M
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
5.46M
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
3.64M
            {
120
3.64M
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
3.64M
              __m256i
123
3.64M
              vsrc1 = vsrc1v[col][0];
124
3.64M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
3.64M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
3.64M
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
3.64M
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
3.64M
              vsrc1 = vsrc1v[col][1];
132
3.64M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
3.64M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
3.64M
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
3.64M
            }
137
1.82M
          }
138
457k
        }
139
159k
      }
140
41.7k
    }
141
18.4E
    else
142
18.4E
    {
143
18.4E
      for( int k = 0; k < rows; k += 2 )
144
0
      {
145
0
              TCoeff* dstPtr  =  dst;
146
147
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
0
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
0
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
0
        __m256i vit;
154
155
0
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
0
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
0
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
0
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
0
#endif
166
167
0
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
0
        }
169
        
170
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
0
        {
172
0
          __m128i xscale = maxLoopL == 4
173
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
0
          for( int l = 0; l < maxLoopL; l++ )
180
0
          {
181
0
            __m256i
182
0
            vscale = _mm256_broadcastd_epi32( xscale );
183
0
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
0
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
0
            {
187
0
              __m256i
188
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
0
              __m256i
190
0
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
0
            }
195
0
          }
196
0
        }
197
0
      }
198
18.4E
    }
199
41.7k
  }
200
#else
201
  if( trSize >= 8 )
202
  {
203
    for( int k = 0; k < rows; k += 2 )
204
    {
205
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
      {
212
        __m128i xscale = maxLoopL == 4
213
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
        for( int l = 0; l < maxLoopL; l++ )
220
        {
221
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
          __m128i
225
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
          {
230
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
          }
255
        }
256
      }
257
    }
258
  }
259
#endif
260
18.4E
  else if( trSize >= 4 )
261
0
  {
262
0
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
0
    for( int k = 0; k < rows; k += 2 )
265
0
    {
266
0
            TCoeff* dstPtr  =  dst;
267
268
0
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
0
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
0
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
0
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
0
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
0
      {
278
0
        __m128i xscale = maxLoopL == 4
279
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
0
        for( int l = 0; l < maxLoopL; l++ )
286
0
        {
287
0
          __m128i
288
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
0
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
0
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
0
          {
293
0
            __m128i
294
0
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
0
            __m128i 
296
0
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
0
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
0
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
0
          }
301
0
        }
302
0
      }
303
0
    }
304
0
  }
305
18.4E
  else
306
18.4E
  {
307
18.4E
    THROW_FATAL( "Unsupported size" );
308
18.4E
  }
309
41.7k
#if USE_AVX2
310
311
41.7k
  _mm256_zeroupper();
312
41.7k
#endif
313
41.7k
}
void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 64>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
67
7.78k
{
68
7.78k
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
7.78k
#if USE_AVX2
71
7.78k
  if( trSize >= 8 && vext >= AVX2 )
72
7.78k
  {
73
7.78k
    if( ( trSize & 15 ) == 0 )
74
7.78k
    {
75
7.78k
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
36.8k
      for( int k = 0; k < rows; k += 2 )
78
29.0k
      {
79
29.0k
              TCoeff* dstPtr =  dst;
80
81
29.0k
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
29.0k
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
29.0k
        __m256i vsrc1v[trLoops][2];
85
        
86
29.0k
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
29.0k
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
145k
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
116k
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
116k
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
116k
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
116k
#endif
98
99
116k
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
116k
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
116k
        }
102
103
282k
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
253k
        {
105
253k
          __m128i xscale = maxLoopL == 4
106
253k
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
253k
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
253k
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
253k
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
1.08M
          for( int l = 0; l < maxLoopL; l++ )
113
870k
          {
114
870k
            __m256i
115
870k
            vscale = _mm256_broadcastd_epi32( xscale );
116
870k
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
4.35M
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
3.48M
            {
120
3.48M
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
3.48M
              __m256i
123
3.48M
              vsrc1 = vsrc1v[col][0];
124
3.48M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
3.48M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
3.48M
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
3.48M
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
3.48M
              vsrc1 = vsrc1v[col][1];
132
3.48M
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
3.48M
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
3.48M
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
3.48M
            }
137
870k
          }
138
217k
        }
139
29.0k
      }
140
7.78k
    }
141
0
    else
142
0
    {
143
0
      for( int k = 0; k < rows; k += 2 )
144
0
      {
145
0
              TCoeff* dstPtr  =  dst;
146
147
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
0
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
0
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
0
        __m256i vit;
154
155
0
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
0
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
0
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
0
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
0
#endif
166
167
0
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
0
        }
169
        
170
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
0
        {
172
0
          __m128i xscale = maxLoopL == 4
173
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
0
          for( int l = 0; l < maxLoopL; l++ )
180
0
          {
181
0
            __m256i
182
0
            vscale = _mm256_broadcastd_epi32( xscale );
183
0
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
0
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
0
            {
187
0
              __m256i
188
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
0
              __m256i
190
0
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
0
            }
195
0
          }
196
0
        }
197
0
      }
198
0
    }
199
7.78k
  }
200
#else
201
  if( trSize >= 8 )
202
  {
203
    for( int k = 0; k < rows; k += 2 )
204
    {
205
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
      {
212
        __m128i xscale = maxLoopL == 4
213
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
        for( int l = 0; l < maxLoopL; l++ )
220
        {
221
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
          __m128i
225
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
          {
230
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
          }
255
        }
256
      }
257
    }
258
  }
259
#endif
260
0
  else if( trSize >= 4 )
261
0
  {
262
0
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
0
    for( int k = 0; k < rows; k += 2 )
265
0
    {
266
0
            TCoeff* dstPtr  =  dst;
267
268
0
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
0
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
0
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
0
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
0
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
0
      {
278
0
        __m128i xscale = maxLoopL == 4
279
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
0
        for( int l = 0; l < maxLoopL; l++ )
286
0
        {
287
0
          __m128i
288
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
0
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
0
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
0
          {
293
0
            __m128i
294
0
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
0
            __m128i 
296
0
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
0
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
0
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
0
          }
301
0
        }
302
0
      }
303
0
    }
304
0
  }
305
0
  else
306
0
  {
307
0
    THROW_FATAL( "Unsupported size" );
308
0
  }
309
7.78k
#if USE_AVX2
310
311
7.78k
  _mm256_zeroupper();
312
7.78k
#endif
313
7.78k
}
314
315
template< X86_VEXT vext, int W >
316
void roundClip_SSE( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
317
60.1k
{
318
#if USE_AVX2
319
60.1k
  if( W >= 8 && vext >= AVX2 )
320
53.1k
  {
321
53.1k
    __m256i vmin = _mm256_set1_epi32( outputMin );
322
53.1k
    __m256i vmax = _mm256_set1_epi32( outputMax );
323
53.1k
    __m256i vrnd = _mm256_set1_epi32( round );
324
325
373k
    while( height-- )
326
320k
    {
327
1.30M
      for( int col = 0; col < width; col += 8 )
328
988k
      {
329
988k
        __m256i
330
988k
        vdst = _mm256_load_si256( ( __m256i * ) &dst[col] );
331
988k
        vdst = _mm256_add_epi32 ( vdst, vrnd );
332
988k
        vdst = _mm256_srai_epi32( vdst, shift );
333
988k
        vdst = _mm256_max_epi32 ( vdst, vmin );
334
988k
        vdst = _mm256_min_epi32 ( vdst, vmax );
335
988k
        _mm256_store_si256      ( ( __m256i * ) &dst[col], vdst );
336
988k
      }
337
338
320k
      dst += stride;
339
320k
    }
340
53.1k
  }
341
7.04k
  else
342
7.04k
#endif
343
7.04k
  if( W >= 4 )
344
7.04k
  {
345
7.04k
    __m128i vmin = _mm_set1_epi32( outputMin );
346
7.04k
    __m128i vmax = _mm_set1_epi32( outputMax );
347
7.04k
    __m128i vrnd = _mm_set1_epi32( round );
348
349
47.6k
    while( height-- )
350
40.6k
    {
351
81.3k
      for( int col = 0; col < width; col += 4 )
352
40.6k
      {
353
40.6k
        __m128i
354
40.6k
        vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] );
355
40.6k
        vdst = _mm_add_epi32  ( vdst, vrnd );
356
40.6k
        vdst = _mm_srai_epi32 ( vdst, shift );
357
40.6k
        vdst = _mm_max_epi32  ( vdst, vmin );
358
40.6k
        vdst = _mm_min_epi32  ( vdst, vmax );
359
40.6k
        _mm_store_si128       ( ( __m128i * ) &dst[col], vdst );
360
40.6k
      }
361
362
40.6k
      dst += stride;
363
40.6k
    }
364
7.04k
  }
365
1
  else
366
1
  {
367
1
    THROW_FATAL( "Unsupported size" );
368
1
  }
369
#if USE_AVX2
370
371
60.1k
  _mm256_zeroupper();
372
60.1k
#endif
373
60.1k
}
Unexecuted instantiation: void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
317
7.04k
{
318
7.04k
#if USE_AVX2
319
7.04k
  if( W >= 8 && vext >= AVX2 )
320
0
  {
321
0
    __m256i vmin = _mm256_set1_epi32( outputMin );
322
0
    __m256i vmax = _mm256_set1_epi32( outputMax );
323
0
    __m256i vrnd = _mm256_set1_epi32( round );
324
325
0
    while( height-- )
326
0
    {
327
0
      for( int col = 0; col < width; col += 8 )
328
0
      {
329
0
        __m256i
330
0
        vdst = _mm256_load_si256( ( __m256i * ) &dst[col] );
331
0
        vdst = _mm256_add_epi32 ( vdst, vrnd );
332
0
        vdst = _mm256_srai_epi32( vdst, shift );
333
0
        vdst = _mm256_max_epi32 ( vdst, vmin );
334
0
        vdst = _mm256_min_epi32 ( vdst, vmax );
335
0
        _mm256_store_si256      ( ( __m256i * ) &dst[col], vdst );
336
0
      }
337
338
0
      dst += stride;
339
0
    }
340
0
  }
341
7.04k
  else
342
7.04k
#endif
343
7.04k
  if( W >= 4 )
344
7.04k
  {
345
7.04k
    __m128i vmin = _mm_set1_epi32( outputMin );
346
7.04k
    __m128i vmax = _mm_set1_epi32( outputMax );
347
7.04k
    __m128i vrnd = _mm_set1_epi32( round );
348
349
47.6k
    while( height-- )
350
40.6k
    {
351
81.3k
      for( int col = 0; col < width; col += 4 )
352
40.6k
      {
353
40.6k
        __m128i
354
40.6k
        vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] );
355
40.6k
        vdst = _mm_add_epi32  ( vdst, vrnd );
356
40.6k
        vdst = _mm_srai_epi32 ( vdst, shift );
357
40.6k
        vdst = _mm_max_epi32  ( vdst, vmin );
358
40.6k
        vdst = _mm_min_epi32  ( vdst, vmax );
359
40.6k
        _mm_store_si128       ( ( __m128i * ) &dst[col], vdst );
360
40.6k
      }
361
362
40.6k
      dst += stride;
363
40.6k
    }
364
7.04k
  }
365
0
  else
366
0
  {
367
0
    THROW_FATAL( "Unsupported size" );
368
0
  }
369
7.04k
#if USE_AVX2
370
371
7.04k
  _mm256_zeroupper();
372
7.04k
#endif
373
7.04k
}
void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
317
53.1k
{
318
53.1k
#if USE_AVX2
319
53.1k
  if( W >= 8 && vext >= AVX2 )
320
53.1k
  {
321
53.1k
    __m256i vmin = _mm256_set1_epi32( outputMin );
322
53.1k
    __m256i vmax = _mm256_set1_epi32( outputMax );
323
53.1k
    __m256i vrnd = _mm256_set1_epi32( round );
324
325
373k
    while( height-- )
326
320k
    {
327
1.30M
      for( int col = 0; col < width; col += 8 )
328
988k
      {
329
988k
        __m256i
330
988k
        vdst = _mm256_load_si256( ( __m256i * ) &dst[col] );
331
988k
        vdst = _mm256_add_epi32 ( vdst, vrnd );
332
988k
        vdst = _mm256_srai_epi32( vdst, shift );
333
988k
        vdst = _mm256_max_epi32 ( vdst, vmin );
334
988k
        vdst = _mm256_min_epi32 ( vdst, vmax );
335
988k
        _mm256_store_si256      ( ( __m256i * ) &dst[col], vdst );
336
988k
      }
337
338
320k
      dst += stride;
339
320k
    }
340
53.1k
  }
341
1
  else
342
1
#endif
343
1
  if( W >= 4 )
344
0
  {
345
0
    __m128i vmin = _mm_set1_epi32( outputMin );
346
0
    __m128i vmax = _mm_set1_epi32( outputMax );
347
0
    __m128i vrnd = _mm_set1_epi32( round );
348
349
0
    while( height-- )
350
0
    {
351
0
      for( int col = 0; col < width; col += 4 )
352
0
      {
353
0
        __m128i
354
0
        vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] );
355
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
356
0
        vdst = _mm_srai_epi32 ( vdst, shift );
357
0
        vdst = _mm_max_epi32  ( vdst, vmin );
358
0
        vdst = _mm_min_epi32  ( vdst, vmax );
359
0
        _mm_store_si128       ( ( __m128i * ) &dst[col], vdst );
360
0
      }
361
362
0
      dst += stride;
363
0
    }
364
0
  }
365
1
  else
366
1
  {
367
1
    THROW_FATAL( "Unsupported size" );
368
1
  }
369
53.1k
#if USE_AVX2
370
371
53.1k
  _mm256_zeroupper();
372
53.1k
#endif
373
53.1k
}
374
375
template< X86_VEXT vext, int W >
376
void cpyResiClip_SSE( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
377
61.1k
{
378
#if USE_AVX2
379
61.1k
  if( W >= 16 )
380
41.5k
  {
381
41.5k
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
41.5k
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
41.5k
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
1.04M
    while( height-- )
386
998k
    {
387
3.16M
      for( int col = 0; col < width; col += 16 )
388
2.16M
      {
389
2.16M
        __m256i
390
2.16M
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
2.16M
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
2.16M
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
2.16M
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
2.16M
        __m256i                
396
2.16M
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
2.16M
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
2.16M
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
2.16M
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
2.16M
        __m256i
402
2.16M
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
2.16M
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
2.16M
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
2.16M
      }
406
407
998k
      src += width;
408
998k
      dst += stride;
409
998k
    }
410
41.5k
  }
411
19.5k
  else
412
19.5k
#endif
413
19.5k
  if( W >= 8 )
414
13.1k
  {
415
13.1k
    __m128i vmin = _mm_set1_epi32( outputMin );
416
13.1k
    __m128i vmax = _mm_set1_epi32( outputMax );
417
13.1k
    __m128i vrnd = _mm_set1_epi32( round );
418
419
196k
    while( height-- )
420
183k
    {
421
367k
      for( int col = 0; col < width; col += 8 )
422
183k
      {
423
183k
        __m128i
424
183k
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
183k
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
183k
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
183k
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
183k
        __m128i                
429
183k
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
183k
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
183k
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
183k
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
183k
        __m128i
434
183k
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
183k
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
183k
      }
437
438
183k
      src += width;
439
183k
      dst += stride;
440
183k
    }
441
13.1k
  }
442
6.41k
  else if( W >= 4 )
443
6.41k
  {
444
6.41k
    __m128i vmin = _mm_set1_epi32( outputMin );
445
6.41k
    __m128i vmax = _mm_set1_epi32( outputMax );
446
6.41k
    __m128i vrnd = _mm_set1_epi32( round );
447
448
6.41k
    __m128i vzero = _mm_setzero_si128();
449
6.41k
    __m128i vdst;
450
451
91.1k
    while( height-- )
452
84.7k
    {
453
169k
      for( int col = 0; col < width; col += 4 )
454
84.7k
      {
455
84.7k
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
84.7k
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
84.7k
        vdst = _mm_srai_epi32 ( vdst, shift );
458
84.7k
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
84.7k
        vdst = _mm_packs_epi32( vdst, vzero );
460
84.7k
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
84.7k
      }
462
463
84.7k
      src += width;
464
84.7k
      dst += stride;
465
84.7k
    }
466
6.41k
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
61.1k
}
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 16>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 32>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 64>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
377
6.41k
{
378
6.41k
#if USE_AVX2
379
6.41k
  if( W >= 16 )
380
0
  {
381
0
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
0
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
0
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
0
    while( height-- )
386
0
    {
387
0
      for( int col = 0; col < width; col += 16 )
388
0
      {
389
0
        __m256i
390
0
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
0
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
0
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
0
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
0
        __m256i                
396
0
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
0
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
0
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
0
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
0
        __m256i
402
0
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
0
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
0
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
0
      }
406
407
0
      src += width;
408
0
      dst += stride;
409
0
    }
410
0
  }
411
6.41k
  else
412
6.41k
#endif
413
6.41k
  if( W >= 8 )
414
0
  {
415
0
    __m128i vmin = _mm_set1_epi32( outputMin );
416
0
    __m128i vmax = _mm_set1_epi32( outputMax );
417
0
    __m128i vrnd = _mm_set1_epi32( round );
418
419
0
    while( height-- )
420
0
    {
421
0
      for( int col = 0; col < width; col += 8 )
422
0
      {
423
0
        __m128i
424
0
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
0
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
0
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
0
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
0
        __m128i                
429
0
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
0
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
0
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
0
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
0
        __m128i
434
0
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
0
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
0
      }
437
438
0
      src += width;
439
0
      dst += stride;
440
0
    }
441
0
  }
442
6.41k
  else if( W >= 4 )
443
6.41k
  {
444
6.41k
    __m128i vmin = _mm_set1_epi32( outputMin );
445
6.41k
    __m128i vmax = _mm_set1_epi32( outputMax );
446
6.41k
    __m128i vrnd = _mm_set1_epi32( round );
447
448
6.41k
    __m128i vzero = _mm_setzero_si128();
449
6.41k
    __m128i vdst;
450
451
91.1k
    while( height-- )
452
84.7k
    {
453
169k
      for( int col = 0; col < width; col += 4 )
454
84.7k
      {
455
84.7k
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
84.7k
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
84.7k
        vdst = _mm_srai_epi32 ( vdst, shift );
458
84.7k
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
84.7k
        vdst = _mm_packs_epi32( vdst, vzero );
460
84.7k
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
84.7k
      }
462
463
84.7k
      src += width;
464
84.7k
      dst += stride;
465
84.7k
    }
466
6.41k
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
6.41k
}
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
377
13.1k
{
378
13.1k
#if USE_AVX2
379
13.1k
  if( W >= 16 )
380
0
  {
381
0
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
0
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
0
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
0
    while( height-- )
386
0
    {
387
0
      for( int col = 0; col < width; col += 16 )
388
0
      {
389
0
        __m256i
390
0
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
0
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
0
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
0
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
0
        __m256i                
396
0
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
0
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
0
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
0
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
0
        __m256i
402
0
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
0
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
0
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
0
      }
406
407
0
      src += width;
408
0
      dst += stride;
409
0
    }
410
0
  }
411
13.1k
  else
412
13.1k
#endif
413
13.1k
  if( W >= 8 )
414
13.1k
  {
415
13.1k
    __m128i vmin = _mm_set1_epi32( outputMin );
416
13.1k
    __m128i vmax = _mm_set1_epi32( outputMax );
417
13.1k
    __m128i vrnd = _mm_set1_epi32( round );
418
419
196k
    while( height-- )
420
183k
    {
421
367k
      for( int col = 0; col < width; col += 8 )
422
183k
      {
423
183k
        __m128i
424
183k
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
183k
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
183k
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
183k
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
183k
        __m128i                
429
183k
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
183k
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
183k
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
183k
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
183k
        __m128i
434
183k
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
183k
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
183k
      }
437
438
183k
      src += width;
439
183k
      dst += stride;
440
183k
    }
441
13.1k
  }
442
0
  else if( W >= 4 )
443
0
  {
444
0
    __m128i vmin = _mm_set1_epi32( outputMin );
445
0
    __m128i vmax = _mm_set1_epi32( outputMax );
446
0
    __m128i vrnd = _mm_set1_epi32( round );
447
448
0
    __m128i vzero = _mm_setzero_si128();
449
0
    __m128i vdst;
450
451
0
    while( height-- )
452
0
    {
453
0
      for( int col = 0; col < width; col += 4 )
454
0
      {
455
0
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
0
        vdst = _mm_srai_epi32 ( vdst, shift );
458
0
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
0
        vdst = _mm_packs_epi32( vdst, vzero );
460
0
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
0
      }
462
463
0
      src += width;
464
0
      dst += stride;
465
0
    }
466
0
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
13.1k
}
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 16>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
377
16.2k
{
378
16.2k
#if USE_AVX2
379
16.2k
  if( W >= 16 )
380
16.2k
  {
381
16.2k
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
16.2k
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
16.2k
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
289k
    while( height-- )
386
273k
    {
387
546k
      for( int col = 0; col < width; col += 16 )
388
273k
      {
389
273k
        __m256i
390
273k
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
273k
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
273k
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
273k
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
273k
        __m256i                
396
273k
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
273k
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
273k
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
273k
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
273k
        __m256i
402
273k
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
273k
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
273k
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
273k
      }
406
407
273k
      src += width;
408
273k
      dst += stride;
409
273k
    }
410
16.2k
  }
411
0
  else
412
0
#endif
413
0
  if( W >= 8 )
414
0
  {
415
0
    __m128i vmin = _mm_set1_epi32( outputMin );
416
0
    __m128i vmax = _mm_set1_epi32( outputMax );
417
0
    __m128i vrnd = _mm_set1_epi32( round );
418
419
0
    while( height-- )
420
0
    {
421
0
      for( int col = 0; col < width; col += 8 )
422
0
      {
423
0
        __m128i
424
0
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
0
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
0
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
0
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
0
        __m128i                
429
0
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
0
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
0
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
0
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
0
        __m128i
434
0
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
0
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
0
      }
437
438
0
      src += width;
439
0
      dst += stride;
440
0
    }
441
0
  }
442
0
  else if( W >= 4 )
443
0
  {
444
0
    __m128i vmin = _mm_set1_epi32( outputMin );
445
0
    __m128i vmax = _mm_set1_epi32( outputMax );
446
0
    __m128i vrnd = _mm_set1_epi32( round );
447
448
0
    __m128i vzero = _mm_setzero_si128();
449
0
    __m128i vdst;
450
451
0
    while( height-- )
452
0
    {
453
0
      for( int col = 0; col < width; col += 4 )
454
0
      {
455
0
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
0
        vdst = _mm_srai_epi32 ( vdst, shift );
458
0
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
0
        vdst = _mm_packs_epi32( vdst, vzero );
460
0
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
0
      }
462
463
0
      src += width;
464
0
      dst += stride;
465
0
    }
466
0
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
16.2k
}
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 32>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
377
21.4k
{
378
21.4k
#if USE_AVX2
379
21.4k
  if( W >= 16 )
380
21.4k
  {
381
21.4k
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
21.4k
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
21.4k
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
525k
    while( height-- )
386
504k
    {
387
1.51M
      for( int col = 0; col < width; col += 16 )
388
1.00M
      {
389
1.00M
        __m256i
390
1.00M
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
1.00M
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
1.00M
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
1.00M
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
1.00M
        __m256i                
396
1.00M
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
1.00M
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
1.00M
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
1.00M
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
1.00M
        __m256i
402
1.00M
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
1.00M
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
1.00M
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
1.00M
      }
406
407
504k
      src += width;
408
504k
      dst += stride;
409
504k
    }
410
21.4k
  }
411
0
  else
412
0
#endif
413
0
  if( W >= 8 )
414
0
  {
415
0
    __m128i vmin = _mm_set1_epi32( outputMin );
416
0
    __m128i vmax = _mm_set1_epi32( outputMax );
417
0
    __m128i vrnd = _mm_set1_epi32( round );
418
419
0
    while( height-- )
420
0
    {
421
0
      for( int col = 0; col < width; col += 8 )
422
0
      {
423
0
        __m128i
424
0
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
0
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
0
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
0
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
0
        __m128i                
429
0
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
0
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
0
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
0
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
0
        __m128i
434
0
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
0
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
0
      }
437
438
0
      src += width;
439
0
      dst += stride;
440
0
    }
441
0
  }
442
0
  else if( W >= 4 )
443
0
  {
444
0
    __m128i vmin = _mm_set1_epi32( outputMin );
445
0
    __m128i vmax = _mm_set1_epi32( outputMax );
446
0
    __m128i vrnd = _mm_set1_epi32( round );
447
448
0
    __m128i vzero = _mm_setzero_si128();
449
0
    __m128i vdst;
450
451
0
    while( height-- )
452
0
    {
453
0
      for( int col = 0; col < width; col += 4 )
454
0
      {
455
0
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
0
        vdst = _mm_srai_epi32 ( vdst, shift );
458
0
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
0
        vdst = _mm_packs_epi32( vdst, vzero );
460
0
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
0
      }
462
463
0
      src += width;
464
0
      dst += stride;
465
0
    }
466
0
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
21.4k
}
void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 64>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Line
Count
Source
377
3.80k
{
378
3.80k
#if USE_AVX2
379
3.80k
  if( W >= 16 )
380
3.80k
  {
381
3.80k
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
3.80k
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
3.80k
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
225k
    while( height-- )
386
221k
    {
387
1.10M
      for( int col = 0; col < width; col += 16 )
388
886k
      {
389
886k
        __m256i
390
886k
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
886k
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
886k
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
886k
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
886k
        __m256i                
396
886k
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
886k
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
886k
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
886k
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
886k
        __m256i
402
886k
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
886k
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
886k
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
886k
      }
406
407
221k
      src += width;
408
221k
      dst += stride;
409
221k
    }
410
3.80k
  }
411
0
  else
412
0
#endif
413
0
  if( W >= 8 )
414
0
  {
415
0
    __m128i vmin = _mm_set1_epi32( outputMin );
416
0
    __m128i vmax = _mm_set1_epi32( outputMax );
417
0
    __m128i vrnd = _mm_set1_epi32( round );
418
419
0
    while( height-- )
420
0
    {
421
0
      for( int col = 0; col < width; col += 8 )
422
0
      {
423
0
        __m128i
424
0
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
0
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
0
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
0
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
0
        __m128i                
429
0
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
0
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
0
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
0
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
0
        __m128i
434
0
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
0
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
0
      }
437
438
0
      src += width;
439
0
      dst += stride;
440
0
    }
441
0
  }
442
0
  else if( W >= 4 )
443
0
  {
444
0
    __m128i vmin = _mm_set1_epi32( outputMin );
445
0
    __m128i vmax = _mm_set1_epi32( outputMax );
446
0
    __m128i vrnd = _mm_set1_epi32( round );
447
448
0
    __m128i vzero = _mm_setzero_si128();
449
0
    __m128i vdst;
450
451
0
    while( height-- )
452
0
    {
453
0
      for( int col = 0; col < width; col += 4 )
454
0
      {
455
0
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
0
        vdst = _mm_srai_epi32 ( vdst, shift );
458
0
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
0
        vdst = _mm_packs_epi32( vdst, vzero );
460
0
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
0
      }
462
463
0
      src += width;
464
0
      dst += stride;
465
0
    }
466
0
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
3.80k
}
472
473
template<X86_VEXT vext>
474
static void simdInvLfnstNxNCore( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize )
475
18.8k
{
476
18.8k
  CHECK( index > 2 || ( zeroOutSize != 8 && zeroOutSize != 16 ), "Wrong parameters" );
477
478
18.8k
  static constexpr int maxLog2TrDynamicRange = 15;
479
18.8k
  const TCoeff    outputMinimum = -( 1 << maxLog2TrDynamicRange );
480
18.8k
  const TCoeff    outputMaximum =  ( 1 << maxLog2TrDynamicRange ) - 1;
481
18.8k
  const int8_t*   trMat         = ( size > 4 ) ? g_lfnst8x8[mode][index][0] : g_lfnst4x4[mode][index][0];
482
18.8k
  const int       trSize        = ( size > 4 ) ? 48 : 16;
483
18.8k
  int*            out           = dst;
484
485
18.8k
  const __m128i vzero = _mm_setzero_si128();
486
18.8k
  const __m128i vmin  = _mm_set1_epi32( outputMinimum );
487
18.8k
  const __m128i vmax  = _mm_set1_epi32( outputMaximum );
488
489
202k
  for( int j = 0; j < trSize; j += 4, out += 4 )
490
183k
  {
491
183k
    __m128i       vsum[4];
492
493
915k
    for( int k = 0; k < 4; k++, trMat += 16 )
494
732k
    {
495
732k
      const int8_t* trMatTmp = trMat;
496
732k
      int* srcPtr = src;
497
498
732k
      __m128i vsrc;
499
732k
      __m128i vtr;
500
732k
      __m128i vtmp;
501
732k
      __m128i vcur = vzero;
502
503
2.10M
      for( int i = 0; i < zeroOutSize; i += 8, srcPtr += 8, trMatTmp += 8 )
504
1.37M
      {
505
1.37M
        vsrc = _mm_loadu_si128( ( const __m128i* ) srcPtr );
506
1.37M
        vtr  = _mm_loadu_si64( ( const __m128i* ) trMatTmp );
507
1.37M
        vtr  = _mm_cvtepi8_epi16( vtr );
508
1.37M
        vtmp = _mm_cvtepi16_epi32( vtr );
509
510
1.37M
        vtmp = _mm_mullo_epi32( vsrc, vtmp );
511
1.37M
        vcur = _mm_add_epi32( vtmp, vcur );
512
513
1.37M
        vsrc = _mm_loadu_si128( ( const __m128i* ) &srcPtr[4] );
514
1.37M
        vtmp = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtr, vzero ) );
515
      
516
1.37M
        vtmp = _mm_mullo_epi32( vsrc, vtmp );
517
1.37M
        vcur = _mm_add_epi32( vtmp, vcur );
518
1.37M
      }
519
520
732k
      vsum[k] = vcur;
521
732k
    }
522
523
183k
    __m128i vout = _mm_hadd_epi32( _mm_hadd_epi32( vsum[0], vsum[1] ), _mm_hadd_epi32( vsum[2], vsum[3] ) );
524
183k
    vout = _mm_add_epi32( vout, _mm_set1_epi32( 64 ) );
525
183k
    vout = _mm_srai_epi32( vout, 7 );
526
183k
    vout = _mm_min_epi32( _mm_max_epi32( vmin, vout ), vmax );
527
528
183k
    _mm_storeu_si128( ( __m128i* ) out, vout );
529
183k
  }
530
18.8k
}
Unexecuted instantiation: Trafo_sse41.cpp:void vvdec::simdInvLfnstNxNCore<(vvdec::x86_simd::X86_VEXT)1>(int*, int*, unsigned int, unsigned int, unsigned int, int)
Trafo_avx2.cpp:void vvdec::simdInvLfnstNxNCore<(vvdec::x86_simd::X86_VEXT)4>(int*, int*, unsigned int, unsigned int, unsigned int, int)
Line
Count
Source
475
18.8k
{
476
18.8k
  CHECK( index > 2 || ( zeroOutSize != 8 && zeroOutSize != 16 ), "Wrong parameters" );
477
478
18.8k
  static constexpr int maxLog2TrDynamicRange = 15;
479
18.8k
  const TCoeff    outputMinimum = -( 1 << maxLog2TrDynamicRange );
480
18.8k
  const TCoeff    outputMaximum =  ( 1 << maxLog2TrDynamicRange ) - 1;
481
18.8k
  const int8_t*   trMat         = ( size > 4 ) ? g_lfnst8x8[mode][index][0] : g_lfnst4x4[mode][index][0];
482
18.8k
  const int       trSize        = ( size > 4 ) ? 48 : 16;
483
18.8k
  int*            out           = dst;
484
485
18.8k
  const __m128i vzero = _mm_setzero_si128();
486
18.8k
  const __m128i vmin  = _mm_set1_epi32( outputMinimum );
487
18.8k
  const __m128i vmax  = _mm_set1_epi32( outputMaximum );
488
489
202k
  for( int j = 0; j < trSize; j += 4, out += 4 )
490
183k
  {
491
183k
    __m128i       vsum[4];
492
493
915k
    for( int k = 0; k < 4; k++, trMat += 16 )
494
732k
    {
495
732k
      const int8_t* trMatTmp = trMat;
496
732k
      int* srcPtr = src;
497
498
732k
      __m128i vsrc;
499
732k
      __m128i vtr;
500
732k
      __m128i vtmp;
501
732k
      __m128i vcur = vzero;
502
503
2.10M
      for( int i = 0; i < zeroOutSize; i += 8, srcPtr += 8, trMatTmp += 8 )
504
1.37M
      {
505
1.37M
        vsrc = _mm_loadu_si128( ( const __m128i* ) srcPtr );
506
1.37M
        vtr  = _mm_loadu_si64( ( const __m128i* ) trMatTmp );
507
1.37M
        vtr  = _mm_cvtepi8_epi16( vtr );
508
1.37M
        vtmp = _mm_cvtepi16_epi32( vtr );
509
510
1.37M
        vtmp = _mm_mullo_epi32( vsrc, vtmp );
511
1.37M
        vcur = _mm_add_epi32( vtmp, vcur );
512
513
1.37M
        vsrc = _mm_loadu_si128( ( const __m128i* ) &srcPtr[4] );
514
1.37M
        vtmp = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtr, vzero ) );
515
      
516
1.37M
        vtmp = _mm_mullo_epi32( vsrc, vtmp );
517
1.37M
        vcur = _mm_add_epi32( vtmp, vcur );
518
1.37M
      }
519
520
732k
      vsum[k] = vcur;
521
732k
    }
522
523
183k
    __m128i vout = _mm_hadd_epi32( _mm_hadd_epi32( vsum[0], vsum[1] ), _mm_hadd_epi32( vsum[2], vsum[3] ) );
524
183k
    vout = _mm_add_epi32( vout, _mm_set1_epi32( 64 ) );
525
183k
    vout = _mm_srai_epi32( vout, 7 );
526
183k
    vout = _mm_min_epi32( _mm_max_epi32( vmin, vout ), vmax );
527
528
183k
    _mm_storeu_si128( ( __m128i* ) out, vout );
529
183k
  }
530
18.8k
}
531
532
template<X86_VEXT vext>
533
void TCoeffOps::_initTCoeffOpsX86()
534
5.61k
{
535
5.61k
  cpyResiClip[2] = cpyResiClip_SSE<vext,  4>;
536
5.61k
  cpyResiClip[3] = cpyResiClip_SSE<vext,  8>;
537
5.61k
  cpyResiClip[4] = cpyResiClip_SSE<vext, 16>;
538
5.61k
  cpyResiClip[5] = cpyResiClip_SSE<vext, 32>;
539
5.61k
  cpyResiClip[6] = cpyResiClip_SSE<vext, 64>;
540
5.61k
  roundClip4     = roundClip_SSE<vext,  4>;
541
5.61k
  roundClip8     = roundClip_SSE<vext,  8>;
542
5.61k
  fastInvCore[0] = fastInv_SSE  <vext,  4>;
543
5.61k
  fastInvCore[1] = fastInv_SSE  <vext,  8>;
544
5.61k
  fastInvCore[2] = fastInv_SSE  <vext, 16>;
545
5.61k
  fastInvCore[3] = fastInv_SSE  <vext, 32>;
546
5.61k
  fastInvCore[4] = fastInv_SSE  <vext, 64>;
547
5.61k
}
Unexecuted instantiation: void vvdec::TCoeffOps::_initTCoeffOpsX86<(vvdec::x86_simd::X86_VEXT)1>()
void vvdec::TCoeffOps::_initTCoeffOpsX86<(vvdec::x86_simd::X86_VEXT)4>()
Line
Count
Source
534
5.61k
{
535
5.61k
  cpyResiClip[2] = cpyResiClip_SSE<vext,  4>;
536
5.61k
  cpyResiClip[3] = cpyResiClip_SSE<vext,  8>;
537
5.61k
  cpyResiClip[4] = cpyResiClip_SSE<vext, 16>;
538
5.61k
  cpyResiClip[5] = cpyResiClip_SSE<vext, 32>;
539
5.61k
  cpyResiClip[6] = cpyResiClip_SSE<vext, 64>;
540
5.61k
  roundClip4     = roundClip_SSE<vext,  4>;
541
5.61k
  roundClip8     = roundClip_SSE<vext,  8>;
542
5.61k
  fastInvCore[0] = fastInv_SSE  <vext,  4>;
543
5.61k
  fastInvCore[1] = fastInv_SSE  <vext,  8>;
544
5.61k
  fastInvCore[2] = fastInv_SSE  <vext, 16>;
545
5.61k
  fastInvCore[3] = fastInv_SSE  <vext, 32>;
546
5.61k
  fastInvCore[4] = fastInv_SSE  <vext, 64>;
547
5.61k
}
548
549
template<X86_VEXT vext>
550
void TrQuant::_initTrQuantX86()
551
59.8k
{
552
59.8k
  m_invLfnstNxN = simdInvLfnstNxNCore<vext>;
553
59.8k
}
Unexecuted instantiation: void vvdec::TrQuant::_initTrQuantX86<(vvdec::x86_simd::X86_VEXT)1>()
void vvdec::TrQuant::_initTrQuantX86<(vvdec::x86_simd::X86_VEXT)4>()
Line
Count
Source
551
59.8k
{
552
59.8k
  m_invLfnstNxN = simdInvLfnstNxNCore<vext>;
553
59.8k
}
554
555
template void TCoeffOps::_initTCoeffOpsX86<SIMDX86>();
556
template void TrQuant::_initTrQuantX86<SIMDX86>();
557
558
#endif // TARGET_SIMD_X86
559
#endif
560
561
}