Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/TrafoX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     TrafoX86.h
44
    \brief    SIMD trafo
45
*/
46
47
//! \ingroup CommonLib
48
//! \{
49
50
51
#include "CommonLib/CommonDef.h"
52
#include "CommonLib/Rom.h"
53
54
#include "CommonDefX86.h"
55
56
#include "TrQuant.h"
57
#include "TrQuant_EMT.h"
58
59
namespace vvdec
60
{
61
62
#if ENABLE_SIMD_TCOEFF_OPS
63
#ifdef TARGET_SIMD_X86
64
65
template< X86_VEXT vext, int trSize >
66
void fastInv_SSE( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines, unsigned reducedLines, unsigned rows )
67
0
{
68
0
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
69
70
#if USE_AVX2
71
0
  if( trSize >= 8 && vext >= AVX2 )
72
0
  {
73
0
    if( ( trSize & 15 ) == 0 )
74
0
    {
75
0
      static constexpr unsigned trLoops = trSize >= 16 ? trSize >> 4 : 1;
76
77
0
      for( int k = 0; k < rows; k += 2 )
78
0
      {
79
0
              TCoeff* dstPtr =  dst;
80
81
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
82
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
83
84
0
        __m256i vsrc1v[trLoops][2];
85
        
86
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
87
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
88
89
0
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
90
0
        {
91
#if defined( _MSC_VER ) && _MSC_VER > 1900
92
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
93
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
94
#else
95
0
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
96
0
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
97
0
#endif
98
99
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
100
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
101
        }
102
103
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
104
0
        {
105
0
          __m128i xscale = maxLoopL == 4
106
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
107
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
108
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
109
110
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
111
112
0
          for( int l = 0; l < maxLoopL; l++ )
113
0
          {
114
0
            __m256i
115
0
            vscale = _mm256_broadcastd_epi32( xscale );
116
0
            xscale = _mm_bsrli_si128( xscale, 4 );
117
118
0
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
119
0
            {
120
0
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
121
122
0
              __m256i
123
0
              vsrc1 = vsrc1v[col][0];
124
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
125
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
126
127
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
128
            
129
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
130
131
0
              vsrc1 = vsrc1v[col][1];
132
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
133
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
134
135
0
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
136
0
            }
137
0
          }
138
0
        }
139
0
      }
140
0
    }
141
0
    else
142
0
    {
143
0
      for( int k = 0; k < rows; k += 2 )
144
0
      {
145
0
              TCoeff* dstPtr  =  dst;
146
147
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
148
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
149
150
0
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
151
0
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
152
153
0
        __m256i vit;
154
155
0
        {
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
0
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
0
#endif
161
#if defined( _MSC_VER ) && _MSC_VER > 1900
162
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
163
#else
164
0
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
165
0
#endif
166
167
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
168
        }
169
        
170
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
171
0
        {
172
0
          __m128i xscale = maxLoopL == 4
173
0
                         ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
174
0
                         : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
175
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
176
177
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
178
179
0
          for( int l = 0; l < maxLoopL; l++ )
180
0
          {
181
0
            __m256i
182
0
            vscale = _mm256_broadcastd_epi32( xscale );
183
0
            xscale = _mm_bsrli_si128( xscale, 4 );
184
185
0
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
186
0
            {
187
0
              __m256i
188
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
189
0
              __m256i
190
0
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
191
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
192
193
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
194
0
            }
195
0
          }
196
0
        }
197
0
      }
198
0
    }
199
0
  }
200
#else
201
0
  if( trSize >= 8 )
202
0
  {
203
0
    for( int k = 0; k < rows; k += 2 )
204
0
    {
205
0
            TCoeff* dstPtr  =  dst;
206
207
      const TCoeff* srcPtr0 = &src[ k      * lines];
208
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
209
        
210
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
211
0
      {
212
0
        __m128i xscale = maxLoopL == 4
213
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
214
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
215
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
216
217
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
218
219
0
        for( int l = 0; l < maxLoopL; l++ )
220
0
        {
221
0
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
222
0
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
223
224
0
          __m128i
225
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
226
0
          xscale = _mm_bsrli_si128( xscale, 4 );
227
228
0
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
229
0
          {
230
0
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
231
#if defined( _MSC_VER ) && _MSC_VER > 1900
232
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
233
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
234
#else
235
0
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
236
0
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
237
0
#endif
238
239
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
240
241
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
242
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
243
244
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
245
          
246
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
247
          
248
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
249
250
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
251
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
252
          
253
0
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
254
0
          }
255
0
        }
256
0
      }
257
0
    }
258
0
  }
259
0
#endif
260
0
  else if( trSize >= 4 )
261
0
  {
262
0
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
263
264
0
    for( int k = 0; k < rows; k += 2 )
265
0
    {
266
0
            TCoeff* dstPtr  =  dst;
267
268
0
      const TCoeff* srcPtr0 = &src[ k      * lines];
269
0
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
270
271
0
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
272
0
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
273
274
0
      __m128i vit = _mm_unpacklo_epi16( _mm_loadu_si64( ( const __m128i * ) itPtr0 ), _mm_loadu_si64( ( const __m128i * ) itPtr1 ) );
275
 
276
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
277
0
      {
278
0
        __m128i xscale = maxLoopL == 4
279
0
                        ? _mm_packs_epi32( _mm_load_si128( ( const __m128i* )srcPtr0 ), _mm_load_si128( ( const __m128i* )srcPtr1 ) )
280
0
                        : _mm_packs_epi32( _mm_loadu_si64( ( const __m128i* )srcPtr0 ), _mm_loadu_si64( ( const __m128i* )srcPtr1 ) );
281
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
282
283
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
284
285
0
        for( int l = 0; l < maxLoopL; l++ )
286
0
        {
287
0
          __m128i
288
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
289
0
          xscale = _mm_bsrli_si128( xscale, 4 );
290
291
0
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
292
0
          {
293
0
            __m128i
294
0
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
295
0
            __m128i 
296
0
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
297
0
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
298
299
0
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
300
0
          }
301
0
        }
302
0
      }
303
0
    }
304
0
  }
305
0
  else
306
0
  {
307
0
    THROW_FATAL( "Unsupported size" );
308
0
  }
309
#if USE_AVX2
310
311
0
  _mm256_zeroupper();
312
0
#endif
313
0
}
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 16>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 32>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)1, 64>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 16>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 32>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvdec::fastInv_SSE<(vvdec::x86_simd::X86_VEXT)4, 64>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
314
315
template< X86_VEXT vext, int W >
316
void roundClip_SSE( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
317
0
{
318
#if USE_AVX2
319
0
  if( W >= 8 && vext >= AVX2 )
320
0
  {
321
0
    __m256i vmin = _mm256_set1_epi32( outputMin );
322
0
    __m256i vmax = _mm256_set1_epi32( outputMax );
323
0
    __m256i vrnd = _mm256_set1_epi32( round );
324
325
0
    while( height-- )
326
0
    {
327
0
      for( int col = 0; col < width; col += 8 )
328
0
      {
329
0
        __m256i
330
0
        vdst = _mm256_load_si256( ( __m256i * ) &dst[col] );
331
0
        vdst = _mm256_add_epi32 ( vdst, vrnd );
332
0
        vdst = _mm256_srai_epi32( vdst, shift );
333
0
        vdst = _mm256_max_epi32 ( vdst, vmin );
334
0
        vdst = _mm256_min_epi32 ( vdst, vmax );
335
0
        _mm256_store_si256      ( ( __m256i * ) &dst[col], vdst );
336
0
      }
337
338
0
      dst += stride;
339
0
    }
340
0
  }
341
0
  else
342
0
#endif
343
0
  if( W >= 4 )
344
0
  {
345
0
    __m128i vmin = _mm_set1_epi32( outputMin );
346
0
    __m128i vmax = _mm_set1_epi32( outputMax );
347
0
    __m128i vrnd = _mm_set1_epi32( round );
348
349
0
    while( height-- )
350
0
    {
351
0
      for( int col = 0; col < width; col += 4 )
352
0
      {
353
0
        __m128i
354
0
        vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] );
355
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
356
0
        vdst = _mm_srai_epi32 ( vdst, shift );
357
0
        vdst = _mm_max_epi32  ( vdst, vmin );
358
0
        vdst = _mm_min_epi32  ( vdst, vmax );
359
0
        _mm_store_si128       ( ( __m128i * ) &dst[col], vdst );
360
0
      }
361
362
0
      dst += stride;
363
0
    }
364
0
  }
365
0
  else
366
0
  {
367
0
    THROW_FATAL( "Unsupported size" );
368
0
  }
369
#if USE_AVX2
370
371
0
  _mm256_zeroupper();
372
0
#endif
373
0
}
Unexecuted instantiation: void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::roundClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
374
375
template< X86_VEXT vext, int W >
376
void cpyResiClip_SSE( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
377
0
{
378
#if USE_AVX2
379
0
  if( W >= 16 )
380
0
  {
381
0
    __m256i vmin = _mm256_set1_epi32( outputMin );
382
0
    __m256i vmax = _mm256_set1_epi32( outputMax );
383
0
    __m256i vrnd = _mm256_set1_epi32( round );
384
385
0
    while( height-- )
386
0
    {
387
0
      for( int col = 0; col < width; col += 16 )
388
0
      {
389
0
        __m256i
390
0
        vsrc1 = _mm256_load_si256 ( ( const __m256i * ) &src[col] );
391
0
        vsrc1 = _mm256_add_epi32  ( vsrc1, vrnd );
392
0
        vsrc1 = _mm256_srai_epi32 ( vsrc1, shift );
393
0
        vsrc1 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc1, vmax ), vmin );
394
395
0
        __m256i                
396
0
        vsrc2 = _mm256_load_si256 ( ( const __m256i * ) &src[col+8] );
397
0
        vsrc2 = _mm256_add_epi32  ( vsrc2, vrnd );
398
0
        vsrc2 = _mm256_srai_epi32 ( vsrc2, shift );
399
0
        vsrc2 = _mm256_max_epi32  ( _mm256_min_epi32( vsrc2, vmax ), vmin );
400
401
0
        __m256i
402
0
        vdst  = _mm256_packs_epi32( vsrc1, vsrc2 );
403
0
        vdst  = _mm256_permute4x64_epi64( vdst, ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
404
0
        _mm256_storeu_si256       ( ( __m256i * ) &dst[col], vdst );
405
0
      }
406
407
0
      src += width;
408
0
      dst += stride;
409
0
    }
410
0
  }
411
0
  else
412
0
#endif
413
0
  if( W >= 8 )
414
0
  {
415
0
    __m128i vmin = _mm_set1_epi32( outputMin );
416
0
    __m128i vmax = _mm_set1_epi32( outputMax );
417
0
    __m128i vrnd = _mm_set1_epi32( round );
418
419
0
    while( height-- )
420
0
    {
421
0
      for( int col = 0; col < width; col += 8 )
422
0
      {
423
0
        __m128i
424
0
        vsrc1 = _mm_load_si128 ( ( const __m128i * ) &src[col] );
425
0
        vsrc1 = _mm_add_epi32  ( vsrc1, vrnd );
426
0
        vsrc1 = _mm_srai_epi32 ( vsrc1, shift );
427
0
        vsrc1 = _mm_max_epi32  ( _mm_min_epi32( vsrc1, vmax ), vmin );
428
0
        __m128i                
429
0
        vsrc2 = _mm_load_si128 ( ( const __m128i * ) &src[col+4] );
430
0
        vsrc2 = _mm_add_epi32  ( vsrc2, vrnd );
431
0
        vsrc2 = _mm_srai_epi32 ( vsrc2, shift );
432
0
        vsrc2 = _mm_max_epi32  ( _mm_min_epi32( vsrc2, vmax ), vmin );
433
0
        __m128i
434
0
        vdst  = _mm_packs_epi32( vsrc1, vsrc2 );
435
0
        _mm_storeu_si128       ( ( __m128i * ) &dst[col], vdst );
436
0
      }
437
438
0
      src += width;
439
0
      dst += stride;
440
0
    }
441
0
  }
442
0
  else if( W >= 4 )
443
0
  {
444
0
    __m128i vmin = _mm_set1_epi32( outputMin );
445
0
    __m128i vmax = _mm_set1_epi32( outputMax );
446
0
    __m128i vrnd = _mm_set1_epi32( round );
447
448
0
    __m128i vzero = _mm_setzero_si128();
449
0
    __m128i vdst;
450
451
0
    while( height-- )
452
0
    {
453
0
      for( int col = 0; col < width; col += 4 )
454
0
      {
455
0
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
456
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
457
0
        vdst = _mm_srai_epi32 ( vdst, shift );
458
0
        vdst = _mm_max_epi32  ( _mm_min_epi32( vdst, vmax ), vmin );
459
0
        vdst = _mm_packs_epi32( vdst, vzero );
460
0
        _mm_storeu_si64       ( ( __m128i * ) &dst[col], vdst );
461
0
      }
462
463
0
      src += width;
464
0
      dst += stride;
465
0
    }
466
0
  }
467
0
  else
468
0
  {
469
0
    THROW_FATAL( "Unsupported size" );
470
0
  }
471
0
}
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 16>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 32>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)1, 64>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 16>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 32>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvdec::cpyResiClip_SSE<(vvdec::x86_simd::X86_VEXT)4, 64>(int const*, short*, long, unsigned int, unsigned int, int, int, int, int)
472
473
template<X86_VEXT vext>
474
static void simdInvLfnstNxNCore( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize )
475
0
{
476
0
  CHECK( index > 2 || ( zeroOutSize != 8 && zeroOutSize != 16 ), "Wrong parameters" );
477
478
0
  static constexpr int maxLog2TrDynamicRange = 15;
479
0
  const TCoeff    outputMinimum = -( 1 << maxLog2TrDynamicRange );
480
0
  const TCoeff    outputMaximum =  ( 1 << maxLog2TrDynamicRange ) - 1;
481
0
  const int8_t*   trMat         = ( size > 4 ) ? g_lfnst8x8[mode][index][0] : g_lfnst4x4[mode][index][0];
482
0
  const int       trSize        = ( size > 4 ) ? 48 : 16;
483
0
  int*            out           = dst;
484
485
0
  const __m128i vzero = _mm_setzero_si128();
486
0
  const __m128i vmin  = _mm_set1_epi32( outputMinimum );
487
0
  const __m128i vmax  = _mm_set1_epi32( outputMaximum );
488
489
0
  for( int j = 0; j < trSize; j += 4, out += 4 )
490
0
  {
491
0
    __m128i       vsum[4];
492
493
0
    for( int k = 0; k < 4; k++, trMat += 16 )
494
0
    {
495
0
      const int8_t* trMatTmp = trMat;
496
0
      int* srcPtr = src;
497
498
0
      __m128i vsrc;
499
0
      __m128i vtr;
500
0
      __m128i vtmp;
501
0
      __m128i vcur = vzero;
502
503
0
      for( int i = 0; i < zeroOutSize; i += 8, srcPtr += 8, trMatTmp += 8 )
504
0
      {
505
0
        vsrc = _mm_loadu_si128( ( const __m128i* ) srcPtr );
506
0
        vtr  = _mm_loadu_si64( ( const __m128i* ) trMatTmp );
507
0
        vtr  = _mm_cvtepi8_epi16( vtr );
508
0
        vtmp = _mm_cvtepi16_epi32( vtr );
509
510
0
        vtmp = _mm_mullo_epi32( vsrc, vtmp );
511
0
        vcur = _mm_add_epi32( vtmp, vcur );
512
513
0
        vsrc = _mm_loadu_si128( ( const __m128i* ) &srcPtr[4] );
514
0
        vtmp = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtr, vzero ) );
515
      
516
0
        vtmp = _mm_mullo_epi32( vsrc, vtmp );
517
0
        vcur = _mm_add_epi32( vtmp, vcur );
518
0
      }
519
520
0
      vsum[k] = vcur;
521
0
    }
522
523
0
    __m128i vout = _mm_hadd_epi32( _mm_hadd_epi32( vsum[0], vsum[1] ), _mm_hadd_epi32( vsum[2], vsum[3] ) );
524
0
    vout = _mm_add_epi32( vout, _mm_set1_epi32( 64 ) );
525
0
    vout = _mm_srai_epi32( vout, 7 );
526
0
    vout = _mm_min_epi32( _mm_max_epi32( vmin, vout ), vmax );
527
528
0
    _mm_storeu_si128( ( __m128i* ) out, vout );
529
0
  }
530
0
}
Unexecuted instantiation: Trafo_sse41.cpp:void vvdec::simdInvLfnstNxNCore<(vvdec::x86_simd::X86_VEXT)1>(int*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: Trafo_avx2.cpp:void vvdec::simdInvLfnstNxNCore<(vvdec::x86_simd::X86_VEXT)4>(int*, int*, unsigned int, unsigned int, unsigned int, int)
531
532
template<X86_VEXT vext>
533
void TCoeffOps::_initTCoeffOpsX86()
534
0
{
535
0
  cpyResiClip[2] = cpyResiClip_SSE<vext,  4>;
536
0
  cpyResiClip[3] = cpyResiClip_SSE<vext,  8>;
537
0
  cpyResiClip[4] = cpyResiClip_SSE<vext, 16>;
538
0
  cpyResiClip[5] = cpyResiClip_SSE<vext, 32>;
539
0
  cpyResiClip[6] = cpyResiClip_SSE<vext, 64>;
540
0
  roundClip4     = roundClip_SSE<vext,  4>;
541
0
  roundClip8     = roundClip_SSE<vext,  8>;
542
0
  fastInvCore[0] = fastInv_SSE  <vext,  4>;
543
0
  fastInvCore[1] = fastInv_SSE  <vext,  8>;
544
0
  fastInvCore[2] = fastInv_SSE  <vext, 16>;
545
0
  fastInvCore[3] = fastInv_SSE  <vext, 32>;
546
0
  fastInvCore[4] = fastInv_SSE  <vext, 64>;
547
0
}
Unexecuted instantiation: void vvdec::TCoeffOps::_initTCoeffOpsX86<(vvdec::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvdec::TCoeffOps::_initTCoeffOpsX86<(vvdec::x86_simd::X86_VEXT)4>()
548
549
template<X86_VEXT vext>
550
void TrQuant::_initTrQuantX86()
551
0
{
552
0
  m_invLfnstNxN = simdInvLfnstNxNCore<vext>;
553
0
}
Unexecuted instantiation: void vvdec::TrQuant::_initTrQuantX86<(vvdec::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvdec::TrQuant::_initTrQuantX86<(vvdec::x86_simd::X86_VEXT)4>()
554
555
template void TCoeffOps::_initTCoeffOpsX86<SIMDX86>();
556
template void TrQuant::_initTrQuantX86<SIMDX86>();
557
558
#endif // TARGET_SIMD_X86
559
#endif
560
561
}