Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvenc/source/Lib/CommonLib/x86/TrafoX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
/** \file     TrafoX86.h
43
    \brief    SIMD averaging.
44
*/
45
46
//! \ingroup CommonLib
47
//! \{
48
49
#include "CommonLib/CommonDef.h"
50
#include "CommonDefX86.h"
51
52
#include "TrQuant.h"
53
#include "TrQuant_EMT.h"
54
55
#if ENABLE_SIMD_TRAFO
56
#ifdef TARGET_SIMD_X86
57
58
namespace vvenc {
59
60
template<X86_VEXT vext, unsigned trSize>
61
void fastInv_SSE( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines, unsigned reducedLines, unsigned rows )
62
0
{
63
0
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );
64
65
#if USE_AVX2
66
0
  if( trSize >= 8 && vext >= AVX2 )
67
0
  {
68
0
    if( ( trSize & 15 ) == 0 )
69
0
    {
70
0
      static constexpr unsigned trLoops = trSize >> 4 ? trSize >> 4 : 1;
71
72
0
      for( int k = 0; k < rows; k += 2 )
73
0
      {
74
0
              TCoeff* dstPtr =  dst;
75
76
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
77
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
78
79
0
        __m256i vsrc1v[trLoops][2];
80
        
81
        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
82
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];
83
84
0
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
85
0
        {
86
#if defined( _MSC_VER ) && _MSC_VER > 1900
87
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
88
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
89
#else
90
0
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
91
0
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
92
0
#endif
93
94
          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
95
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
96
        }
97
98
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
99
0
        {
100
0
          __m128i xscale = maxLoopL == 4
101
0
                         ? _mm_packs_epi32( _mm_loadu_si128( ( const __m128i* )srcPtr0 ), _mm_loadu_si128( ( const __m128i* )srcPtr1 ) )
102
0
                         : _mm_packs_epi32( _vv_loadl_epi64( ( const __m128i* )srcPtr0 ), _vv_loadl_epi64( ( const __m128i* )srcPtr1 ) );
103
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
104
105
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
106
107
0
          for( int l = 0; l < maxLoopL; l++ )
108
0
          {
109
0
            __m256i
110
0
            vscale = _mm256_broadcastd_epi32( xscale );
111
0
            xscale = _mm_bsrli_si128( xscale, 4 );
112
113
0
            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
114
0
            {
115
0
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );
116
117
0
              __m256i
118
0
              vsrc1 = vsrc1v[col][0];
119
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
120
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
121
122
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
123
            
124
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );
125
126
0
              vsrc1 = vsrc1v[col][1];
127
0
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
128
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
129
130
0
              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
131
0
            }
132
0
          }
133
0
        }
134
0
      }
135
0
    }
136
0
    else
137
0
    {
138
0
      for( int k = 0; k < rows; k += 2 )
139
0
      {
140
0
              TCoeff* dstPtr  =  dst;
141
142
0
        const TCoeff* srcPtr0 = &src[ k      * lines];
143
0
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];
144
145
0
        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
146
0
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
147
148
0
        __m256i vit;
149
150
0
        {
151
#if defined( _MSC_VER ) && _MSC_VER > 1900
152
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
153
#else
154
0
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
155
0
#endif
156
#if defined( _MSC_VER ) && _MSC_VER > 1900
157
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
158
#else
159
0
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
160
0
#endif
161
162
          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
163
        }
164
        
165
0
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
166
0
        {
167
0
          __m128i xscale = maxLoopL == 4
168
0
                         ? _mm_packs_epi32( _mm_loadu_si128( ( const __m128i* )srcPtr0 ), _mm_loadu_si128( ( const __m128i* )srcPtr1 ) )
169
0
                         : _mm_packs_epi32( _vv_loadl_epi64( ( const __m128i* )srcPtr0 ), _vv_loadl_epi64( ( const __m128i* )srcPtr1 ) );
170
0
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
171
172
0
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
173
174
0
          for( int l = 0; l < maxLoopL; l++ )
175
0
          {
176
0
            __m256i
177
0
            vscale = _mm256_broadcastd_epi32( xscale );
178
0
            xscale = _mm_bsrli_si128( xscale, 4 );
179
180
0
            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
181
0
            {
182
0
              __m256i
183
0
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
184
0
              __m256i
185
0
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
186
0
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );
187
188
0
              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
189
0
            }
190
0
          }
191
0
        }
192
0
      }
193
0
    }
194
0
  }
195
#else
196
0
  if( trSize >= 8 )
197
0
  {
198
0
    for( int k = 0; k < rows; k += 2 )
199
0
    {
200
0
            TCoeff* dstPtr  =  dst;
201
202
      const TCoeff* srcPtr0 = &src[ k      * lines];
203
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
204
        
205
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
206
0
      {
207
0
        __m128i xscale = maxLoopL == 4
208
0
                        ? _mm_packs_epi32( _mm_loadu_si128( ( const __m128i* )srcPtr0 ), _mm_loadu_si128( ( const __m128i* )srcPtr1 ) )
209
0
                        : _mm_packs_epi32( _vv_loadl_epi64( ( const __m128i* )srcPtr0 ), _vv_loadl_epi64( ( const __m128i* )srcPtr1 ) );
210
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
211
212
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
213
214
0
        for( int l = 0; l < maxLoopL; l++ )
215
0
        {
216
0
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
217
0
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
218
219
0
          __m128i
220
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
221
0
          xscale = _mm_bsrli_si128( xscale, 4 );
222
223
0
          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
224
0
          {
225
0
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
226
#if defined( _MSC_VER ) && _MSC_VER > 1900
227
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
228
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
229
#else
230
0
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
231
0
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
232
0
#endif
233
234
            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );
235
236
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
237
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
238
239
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
240
          
241
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );
242
          
243
            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );
244
245
            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
246
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
247
          
248
0
            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
249
0
          }
250
0
        }
251
0
      }
252
0
    }
253
0
  }
254
0
#endif
255
0
  else if( trSize >= 4 )
256
0
  {
257
0
    CHECKD( trSize != 4, "trSize needs to be '4'!" );
258
259
0
    for( int k = 0; k < rows; k += 2 )
260
0
    {
261
0
            TCoeff* dstPtr  =  dst;
262
263
0
      const TCoeff* srcPtr0 = &src[ k      * lines];
264
0
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];
265
266
0
      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
267
0
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];
268
269
0
      __m128i vit = _mm_unpacklo_epi16( _vv_loadl_epi64( ( const __m128i * ) itPtr0 ), _vv_loadl_epi64( ( const __m128i * ) itPtr1 ) );
270
 
271
0
      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
272
0
      {
273
0
        __m128i xscale = maxLoopL == 4
274
0
                        ? _mm_packs_epi32( _mm_loadu_si128( ( const __m128i* )srcPtr0 ), _mm_loadu_si128( ( const __m128i* )srcPtr1 ) )
275
0
                        : _mm_packs_epi32( _vv_loadl_epi64( ( const __m128i* )srcPtr0 ), _vv_loadl_epi64( ( const __m128i* )srcPtr1 ) );
276
0
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );
277
278
0
        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }
279
280
0
        for( int l = 0; l < maxLoopL; l++ )
281
0
        {
282
0
          __m128i
283
0
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
284
0
          xscale = _mm_bsrli_si128( xscale, 4 );
285
286
0
          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
287
0
          {
288
0
            __m128i
289
0
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
290
0
            __m128i 
291
0
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
292
0
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );
293
294
0
            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
295
0
          }
296
0
        }
297
0
      }
298
0
    }
299
0
  }
300
0
  else
301
0
  {
302
0
    THROW( "Unsupported size" );
303
0
  }
304
#if USE_AVX2
305
306
0
  _mm256_zeroupper();
307
0
#endif
308
0
}
Unexecuted instantiation: void vvenc::fastInv_SSE<(vvenc::x86_simd::X86_VEXT)1, 4u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInv_SSE<(vvenc::x86_simd::X86_VEXT)1, 8u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInv_SSE<(vvenc::x86_simd::X86_VEXT)1, 16u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInv_SSE<(vvenc::x86_simd::X86_VEXT)1, 32u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInv_SSE<(vvenc::x86_simd::X86_VEXT)1, 64u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInv_SSE<(vvenc::x86_simd::X86_VEXT)4, 4u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInv_SSE<(vvenc::x86_simd::X86_VEXT)4, 8u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInv_SSE<(vvenc::x86_simd::X86_VEXT)4, 16u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInv_SSE<(vvenc::x86_simd::X86_VEXT)4, 32u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInv_SSE<(vvenc::x86_simd::X86_VEXT)4, 64u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
309
310
template<X86_VEXT vext, int trSize>
311
void fastFwd_SSE( const TMatrixCoeff* tc, const TCoeff* src, TCoeff* dst, unsigned line, unsigned reducedLine, unsigned cutoff, int shift )
312
0
{
313
0
  const int rnd_factor = 1 << ( shift - 1 );
314
  
315
  //for( int i = 0; i < reducedLine; i++ )
316
  //{
317
  //        TCoeff*       dstPtr = dst;
318
  //  const TMatrixCoeff* iT     = tc;
319
  //
320
  //  for( int j = 0; j < cutoff; j++ )
321
  //  {
322
  //    int sum = 0;
323
  //
324
  //    for( int k = 0; k < trSize; k++ )
325
  //    {
326
  //      // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
327
  //      sum += src[k] * iT[k];
328
  //    }
329
  //
330
  //    dstPtr[i] = ( sum + rnd_factor ) >> shift;
331
  //    dstPtr   += line;
332
  //    iT       += trSize;
333
  //  }
334
  //
335
  //  src += trSize;
336
  //}
337
338
0
  if( trSize >= 8 )
339
0
  {
340
#if USE_AVX2
341
0
    if( vext >= AVX2 && ( trSize & 15 ) == 0 )
342
0
    {
343
#if FIX_FOR_TEMPORARY_COMPILER_ISSUES_ENABLED && defined( __GNUC__ ) && !defined( __clang__ )
344
#pragma GCC diagnostic push
345
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
346
      // vsrcarr[2] and vsrcarr[3] might be uninitialized for nlx4==0, but in that case they will not be used, so discard the warning!
347
#endif
348
0
      static constexpr unsigned trLoops = trSize >> 4 ? trSize >> 4 : 1;
349
350
      // is number of lines a multiple of 4
351
0
      const int nlx4 = reducedLine == 2 ? 0 : 1;
352
353
0
      for( int i = 0; i < reducedLine; i += ( 2 << nlx4 ) )
354
0
      {
355
0
              TCoeff*       dstPtr = dst + i;
356
0
        const TMatrixCoeff* itPtr  = tc;
357
        
358
0
        __m256i vsrcarr[trLoops][4];
359
          
360
0
        for( int k = 0; k < trLoops; k++ )
361
0
        {
362
0
          __m256i vsrc0 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 0] );
363
0
          __m256i vsrc1 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 8] );
364
0
          __m256i vsrc  = _mm256_packs_epi32( vsrc0, vsrc1 );
365
0
          vsrc = _mm256_permute4x64_epi64( vsrc, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
366
367
0
          vsrcarr[k][0] = vsrc;
368
          
369
0
          vsrc0 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 0 + trSize] );
370
0
          vsrc1 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 8 + trSize] );
371
0
          vsrc  = _mm256_packs_epi32( vsrc0, vsrc1 );
372
0
          vsrc  = _mm256_permute4x64_epi64( vsrc, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
373
374
0
          vsrcarr[k][1] = vsrc;
375
376
0
          if( !nlx4 ) continue;
377
378
0
          vsrc0 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 0 + 2 * trSize] );
379
0
          vsrc1 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 8 + 2 * trSize] );
380
0
          vsrc = _mm256_packs_epi32( vsrc0, vsrc1 );
381
0
          vsrc = _mm256_permute4x64_epi64( vsrc, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
382
383
0
          vsrcarr[k][2] = vsrc;
384
385
0
          vsrc0 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 0 + 3 * trSize] );
386
0
          vsrc1 = _mm256_load_si256( ( const __m256i* ) &src[(k << 4) + 8 + 3 * trSize] );
387
0
          vsrc = _mm256_packs_epi32( vsrc0, vsrc1 );
388
0
          vsrc = _mm256_permute4x64_epi64( vsrc, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
389
390
0
          vsrcarr[k][3] = vsrc;
391
0
        }
392
393
0
        for( int j = 0; j < cutoff; j += 4 )
394
0
        {
395
0
          __m256i vsum00 = _mm256_setzero_si256();
396
0
          __m256i vsum02 = _mm256_setzero_si256();
397
398
0
          __m256i vsum10 = _mm256_setzero_si256();
399
0
          __m256i vsum12 = _mm256_setzero_si256();
400
          
401
0
          __m256i vsum20 = _mm256_setzero_si256();
402
0
          __m256i vsum22 = _mm256_setzero_si256();
403
404
0
          __m256i vsum30 = _mm256_setzero_si256();
405
0
          __m256i vsum32 = _mm256_setzero_si256();
406
407
0
          for( int k = 0; k < trLoops; k++ )
408
0
          {
409
            // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
410
411
#if 0
412
#if defined( _MSC_VER ) && _MSC_VER > 1900
413
            __m256i vit0  = _mm256_stream_load_si256( ( const __m256i* ) &itPtr[k + 0 * trSize] );
414
            __m256i vit1  = _mm256_stream_load_si256( ( const __m256i* ) &itPtr[k + 1 * trSize] );
415
            __m256i vit2  = _mm256_stream_load_si256( ( const __m256i* ) &itPtr[k + 2 * trSize] );
416
            __m256i vit3  = _mm256_stream_load_si256( ( const __m256i* ) &itPtr[k + 3 * trSize] );
417
#else
418
            __m256i vit0  = _mm256_stream_load_si256( (       __m256i* ) &itPtr[k + 0 * trSize] );
419
            __m256i vit1  = _mm256_stream_load_si256( (       __m256i* ) &itPtr[k + 1 * trSize] );
420
            __m256i vit2  = _mm256_stream_load_si256( (       __m256i* ) &itPtr[k + 2 * trSize] );
421
            __m256i vit3  = _mm256_stream_load_si256( (       __m256i* ) &itPtr[k + 3 * trSize] );
422
#endif
423
#else
424
0
            __m256i vit0  = _mm256_load_si256( ( const __m256i* ) &itPtr[(k << 4) + 0 * trSize] );
425
0
            __m256i vit1  = _mm256_load_si256( ( const __m256i* ) &itPtr[(k << 4) + 1 * trSize] );
426
0
            __m256i vit2  = _mm256_load_si256( ( const __m256i* ) &itPtr[(k << 4) + 2 * trSize] );
427
0
            __m256i vit3  = _mm256_load_si256( ( const __m256i* ) &itPtr[(k << 4) + 3 * trSize] );
428
0
#endif
429
430
            __m256i
431
            vsrc  = vsrcarr[k][0];
432
433
            vsum00 = _mm256_add_epi32( vsum00, _mm256_hadd_epi32( _mm256_madd_epi16( vit0, vsrc ), _mm256_madd_epi16( vit1, vsrc ) ) );
434
            vsum02 = _mm256_add_epi32( vsum02, _mm256_hadd_epi32( _mm256_madd_epi16( vit2, vsrc ), _mm256_madd_epi16( vit3, vsrc ) ) );
435
     
436
            vsrc  = vsrcarr[k][1];
437
438
            vsum10 = _mm256_add_epi32( vsum10, _mm256_hadd_epi32( _mm256_madd_epi16( vit0, vsrc ), _mm256_madd_epi16( vit1, vsrc ) ) );
439
            vsum12 = _mm256_add_epi32( vsum12, _mm256_hadd_epi32( _mm256_madd_epi16( vit2, vsrc ), _mm256_madd_epi16( vit3, vsrc ) ) );
440
441
            // skip branching
442
            //if( !nlx4 ) continue;
443
     
444
            vsrc  = vsrcarr[k][2];
445
446
            vsum20 = _mm256_add_epi32( vsum20, _mm256_hadd_epi32( _mm256_madd_epi16( vit0, vsrc ), _mm256_madd_epi16( vit1, vsrc ) ) );
447
            vsum22 = _mm256_add_epi32( vsum22, _mm256_hadd_epi32( _mm256_madd_epi16( vit2, vsrc ), _mm256_madd_epi16( vit3, vsrc ) ) );
448
            
449
            vsrc  = vsrcarr[k][3];
450
451
            vsum30 = _mm256_add_epi32( vsum30, _mm256_hadd_epi32( _mm256_madd_epi16( vit0, vsrc ), _mm256_madd_epi16( vit1, vsrc ) ) );
452
            vsum32 = _mm256_add_epi32( vsum32, _mm256_hadd_epi32( _mm256_madd_epi16( vit2, vsrc ), _mm256_madd_epi16( vit3, vsrc ) ) );
453
          }
454
455
          vsum00 = _mm256_hadd_epi32( vsum00, vsum02 );
456
457
          __m128i xsum00 = _mm_add_epi32( _mm256_castsi256_si128( vsum00 ), _mm256_extracti128_si256( vsum00, 1 ) );
458
          xsum00 = _mm_add_epi32 ( xsum00, _mm_set1_epi32( rnd_factor ) );
459
          xsum00 = _mm_srai_epi32( xsum00, shift );
460
461
          vsum10 = _mm256_hadd_epi32( vsum10, vsum12 );
462
          
463
          __m128i xsum10 = _mm_add_epi32( _mm256_castsi256_si128( vsum10 ), _mm256_extracti128_si256( vsum10, 1 ) );
464
          xsum10 = _mm_add_epi32 ( xsum10, _mm_set1_epi32( rnd_factor ) );
465
          xsum10 = _mm_srai_epi32( xsum10, shift );
466
467
0
          if( nlx4 )
468
0
          {
469
0
            vsum20 = _mm256_hadd_epi32( vsum20, vsum22 );
470
471
0
            __m128i xsum20 = _mm_add_epi32( _mm256_castsi256_si128( vsum20 ), _mm256_extracti128_si256( vsum20, 1 ) );
472
0
            xsum20 = _mm_add_epi32( xsum20, _mm_set1_epi32( rnd_factor ) );
473
0
            xsum20 = _mm_srai_epi32( xsum20, shift );
474
475
0
            vsum30 = _mm256_hadd_epi32( vsum30, vsum32 );
476
477
0
            __m128i xsum30 = _mm_add_epi32( _mm256_castsi256_si128( vsum30 ), _mm256_extracti128_si256( vsum30, 1 ) );
478
0
            xsum30 = _mm_add_epi32( xsum30, _mm_set1_epi32( rnd_factor ) );
479
0
            xsum30 = _mm_srai_epi32( xsum30, shift );
480
481
0
            __m128i xtmp0 = _mm_unpacklo_epi32( xsum00, xsum10 );
482
0
            __m128i xtmp1 = _mm_unpacklo_epi32( xsum20, xsum30 );
483
484
0
            _mm_store_si128( ( __m128i* ) dstPtr, _mm_unpacklo_epi64( xtmp0, xtmp1 ) ); dstPtr += line;
485
0
            _mm_store_si128( ( __m128i* ) dstPtr, _mm_unpackhi_epi64( xtmp0, xtmp1 ) ); dstPtr += line;
486
487
0
            xtmp0 = _mm_unpackhi_epi32( xsum00, xsum10 );
488
0
            xtmp1 = _mm_unpackhi_epi32( xsum20, xsum30 );
489
490
0
            _mm_store_si128( ( __m128i* ) dstPtr, _mm_unpacklo_epi64( xtmp0, xtmp1 ) ); dstPtr += line;
491
0
            _mm_store_si128( ( __m128i* ) dstPtr, _mm_unpackhi_epi64( xtmp0, xtmp1 ) ); dstPtr += line;
492
0
          }
493
0
          else
494
0
          {
495
0
            __m128i xtmp = _mm_unpacklo_epi32( xsum00, xsum10 );
496
497
0
            _vv_storel_epi64( ( __m128i* ) dstPtr,                     xtmp );         dstPtr += line;
498
0
            _vv_storel_epi64( ( __m128i* ) dstPtr, _mm_unpackhi_epi64( xtmp, xtmp ) ); dstPtr += line;
499
500
0
            xtmp = _mm_unpackhi_epi32( xsum00, xsum10 );
501
502
0
            _vv_storel_epi64( ( __m128i* ) dstPtr,                     xtmp );         dstPtr += line;
503
0
            _vv_storel_epi64( ( __m128i* ) dstPtr, _mm_unpackhi_epi64( xtmp, xtmp ) ); dstPtr += line;
504
0
          }
505
506
          itPtr  += ( trSize << 2 );
507
        }
508
509
0
        src += ( trSize << ( 1 + nlx4 ) );
510
0
      }
511
#if FIX_FOR_TEMPORARY_COMPILER_ISSUES_ENABLED && defined( __GNUC__ ) && !defined( __clang__ )
512
#pragma GCC diagnostic pop
513
#endif
514
0
    }
515
0
    else
516
0
#endif
517
0
    {
518
0
      static constexpr unsigned trLoops = trSize >> 3 ? trSize >> 3 : 1;
519
520
0
      for( int i = 0; i < reducedLine; i += 2 )
521
0
      {
522
0
              TCoeff*       dstPtr = dst + i;
523
0
        const TMatrixCoeff* itPtr  = tc;
524
     
525
0
        __m128i vsrcarr[trLoops][2];
526
          
527
0
        for( int k = 0; k < trLoops; k++ )
528
0
        {
529
0
          __m128i vsrc0 = _mm_load_si128( ( const __m128i* ) &src[(k << 3) + 0] );
530
0
          __m128i vsrc1 = _mm_load_si128( ( const __m128i* ) &src[(k << 3) + 4] );
531
0
          __m128i vsrc  = _mm_packs_epi32( vsrc0, vsrc1 );
532
533
0
          vsrcarr[k][0] = vsrc;
534
          
535
0
          vsrc0 = _mm_load_si128( ( const __m128i* ) &src[(k << 3) + 0 + trSize] );
536
0
          vsrc1 = _mm_load_si128( ( const __m128i* ) &src[(k << 3) + 4 + trSize] );
537
0
          vsrc  = _mm_packs_epi32( vsrc0, vsrc1 );
538
539
0
          vsrcarr[k][1] = vsrc;
540
0
        }
541
542
0
        for( int j = 0; j < cutoff; j += 4 )
543
0
        {
544
0
          __m128i vsum00 = _mm_setzero_si128();
545
0
          __m128i vsum02 = _mm_setzero_si128();
546
        
547
0
          __m128i vsum10 = _mm_setzero_si128();
548
0
          __m128i vsum12 = _mm_setzero_si128();
549
550
0
          for( int k = 0; k < trLoops; k++ )
551
0
          {
552
            // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
553
554
  #if 0
555
  #if defined( _MSC_VER ) && _MSC_VER > 1900
556
            __m128i vit0  = _mm_stream_load_si128( ( const __m128i* ) &itPtr[k + 0 * trSize] );
557
            __m128i vit1  = _mm_stream_load_si128( ( const __m128i* ) &itPtr[k + 1 * trSize] );
558
            __m128i vit2  = _mm_stream_load_si128( ( const __m128i* ) &itPtr[k + 2 * trSize] );
559
            __m128i vit3  = _mm_stream_load_si128( ( const __m128i* ) &itPtr[k + 3 * trSize] );
560
  #else
561
            __m128i vit0  = _mm_stream_load_si128( (       __m128i* ) &itPtr[k + 0 * trSize] );
562
            __m128i vit1  = _mm_stream_load_si128( (       __m128i* ) &itPtr[k + 1 * trSize] );
563
            __m128i vit2  = _mm_stream_load_si128( (       __m128i* ) &itPtr[k + 2 * trSize] );
564
            __m128i vit3  = _mm_stream_load_si128( (       __m128i* ) &itPtr[k + 3 * trSize] );
565
  #endif
566
  #else
567
0
            __m128i vit0  = _mm_load_si128( ( const __m128i* ) &itPtr[(k << 3) + 0 * trSize] );
568
0
            __m128i vit1  = _mm_load_si128( ( const __m128i* ) &itPtr[(k << 3) + 1 * trSize] );
569
0
            __m128i vit2  = _mm_load_si128( ( const __m128i* ) &itPtr[(k << 3) + 2 * trSize] );
570
0
            __m128i vit3  = _mm_load_si128( ( const __m128i* ) &itPtr[(k << 3) + 3 * trSize] );
571
0
  #endif
572
            
573
            // first source line
574
0
            __m128i vsrc  = vsrcarr[k][0];
575
576
0
            vsum00 = _mm_add_epi32( vsum00, _mm_hadd_epi32( _mm_madd_epi16( vit0, vsrc ), _mm_madd_epi16( vit1, vsrc ) ) );
577
0
            vsum02 = _mm_add_epi32( vsum02, _mm_hadd_epi32( _mm_madd_epi16( vit2, vsrc ), _mm_madd_epi16( vit3, vsrc ) ) );
578
          
579
            // second source line
580
0
            vsrc   = vsrcarr[k][1];
581
582
0
            vsum10 = _mm_add_epi32( vsum10, _mm_hadd_epi32( _mm_madd_epi16( vit0, vsrc ), _mm_madd_epi16( vit1, vsrc ) ) );
583
0
            vsum12 = _mm_add_epi32( vsum12, _mm_hadd_epi32( _mm_madd_epi16( vit2, vsrc ), _mm_madd_epi16( vit3, vsrc ) ) );
584
0
          }
585
586
0
          vsum00 = _mm_hadd_epi32( vsum00, vsum02 );
587
0
          vsum00 = _mm_add_epi32 ( vsum00, _mm_set1_epi32( rnd_factor ) );
588
0
          vsum00 = _mm_srai_epi32( vsum00, shift );
589
590
0
          vsum10 = _mm_hadd_epi32( vsum10, vsum12 );
591
0
          vsum10 = _mm_add_epi32 ( vsum10, _mm_set1_epi32( rnd_factor ) );
592
0
          vsum10 = _mm_srai_epi32( vsum10, shift );
593
594
0
          __m128i xtmp = _mm_unpacklo_epi32( vsum00, vsum10 );
595
0
          _vv_storel_epi64( ( __m128i* ) dstPtr, xtmp ); dstPtr += line;
596
597
0
          xtmp = _mm_shuffle_epi32( xtmp, ( 2 << 0 ) + ( 3 << 2 ) );
598
0
          _vv_storel_epi64( ( __m128i* ) dstPtr, xtmp ); dstPtr += line;
599
          
600
0
          xtmp = _mm_unpackhi_epi32( vsum00, vsum10 );
601
0
          _vv_storel_epi64( ( __m128i* ) dstPtr, xtmp ); dstPtr += line;
602
603
0
          xtmp = _mm_shuffle_epi32( xtmp, ( 2 << 0 ) + ( 3 << 2 ) );
604
0
          _vv_storel_epi64( ( __m128i* ) dstPtr, xtmp ); dstPtr += line;
605
606
0
          itPtr  += ( trSize << 2 );
607
0
        }
608
609
0
        src += ( trSize << 1 );
610
0
      }
611
0
    }
612
0
  }
613
0
  else
614
0
  {
615
0
    __m128i vzero = _mm_setzero_si128();
616
617
0
    for( int i = 0; i < reducedLine; i++ )
618
0
    {
619
0
            TCoeff*       dstPtr = dst;
620
0
      const TMatrixCoeff* itPtr  = tc;
621
622
0
      for( int j = 0; j < cutoff; j++ )
623
0
      {
624
0
        __m128i vit   = _vv_loadl_epi64( ( const __m128i* ) itPtr );
625
0
        __m128i vsrc0 = _mm_load_si128 ( ( const __m128i* ) src );
626
627
0
        __m128i vsrc = _mm_packs_epi32( vsrc0, vzero );
628
0
        __m128i vsum = _mm_madd_epi16 ( vit, vsrc );
629
630
0
        dstPtr[i] = ( _mm_extract_epi32( vsum, 0 ) + _mm_extract_epi32( vsum, 1 ) + rnd_factor ) >> shift;
631
632
0
        dstPtr += line;
633
0
        itPtr  += trSize;
634
0
      }
635
636
0
      src += trSize;
637
0
    }
638
0
  }
639
#if USE_AVX2
640
641
  _mm256_zeroupper();
642
#endif
643
0
}
Unexecuted instantiation: void vvenc::fastFwd_SSE<(vvenc::x86_simd::X86_VEXT)1, 4>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwd_SSE<(vvenc::x86_simd::X86_VEXT)1, 8>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwd_SSE<(vvenc::x86_simd::X86_VEXT)1, 16>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwd_SSE<(vvenc::x86_simd::X86_VEXT)1, 32>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwd_SSE<(vvenc::x86_simd::X86_VEXT)1, 64>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwd_SSE<(vvenc::x86_simd::X86_VEXT)4, 4>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwd_SSE<(vvenc::x86_simd::X86_VEXT)4, 8>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwd_SSE<(vvenc::x86_simd::X86_VEXT)4, 16>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwd_SSE<(vvenc::x86_simd::X86_VEXT)4, 32>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwd_SSE<(vvenc::x86_simd::X86_VEXT)4, 64>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
644
645
template< X86_VEXT vext, int W >
646
void roundClip_SSE( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
647
0
{
648
#if USE_AVX2
649
0
  if( W >= 8 && vext >= AVX2 )
650
0
  {
651
0
    __m256i vmin = _mm256_set1_epi32( outputMin );
652
0
    __m256i vmax = _mm256_set1_epi32( outputMax );
653
0
    __m256i vrnd = _mm256_set1_epi32( round );
654
655
0
    while( height-- )
656
0
    {
657
0
      for( int col = 0; col < width; col += 8 )
658
0
      {
659
0
        __m256i
660
0
        vdst = _mm256_load_si256( ( __m256i * ) &dst[col] );
661
0
        vdst = _mm256_add_epi32 ( vdst, vrnd );
662
0
        vdst = _mm256_srai_epi32( vdst, shift );
663
0
        vdst = _mm256_max_epi32 ( vdst, vmin );
664
0
        vdst = _mm256_min_epi32 ( vdst, vmax );
665
0
        _mm256_store_si256      ( ( __m256i * ) &dst[col], vdst );
666
0
      }
667
668
0
      dst += stride;
669
0
    }
670
0
  }
671
0
  else
672
0
#endif
673
0
  if( W >= 4 )
674
0
  {
675
0
    __m128i vmin = _mm_set1_epi32( outputMin );
676
0
    __m128i vmax = _mm_set1_epi32( outputMax );
677
0
    __m128i vrnd = _mm_set1_epi32( round );
678
679
0
    while( height-- )
680
0
    {
681
0
      for( int col = 0; col < width; col += 4 )
682
0
      {
683
0
        __m128i
684
0
        vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] );
685
0
        vdst = _mm_add_epi32  ( vdst, vrnd );
686
0
        vdst = _mm_srai_epi32 ( vdst, shift );
687
0
        vdst = _mm_max_epi32  ( vdst, vmin );
688
0
        vdst = _mm_min_epi32  ( vdst, vmax );
689
0
        _mm_store_si128       ( ( __m128i * ) &dst[col], vdst );
690
0
      }
691
692
0
      dst += stride;
693
0
    }
694
0
  }
695
0
  else
696
0
  {
697
0
    THROW( "Unsupported size" );
698
0
  }
699
#if USE_AVX2
700
701
0
  _mm256_zeroupper();
702
0
#endif
703
0
}
Unexecuted instantiation: void vvenc::roundClip_SSE<(vvenc::x86_simd::X86_VEXT)1, 4>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvenc::roundClip_SSE<(vvenc::x86_simd::X86_VEXT)1, 8>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvenc::roundClip_SSE<(vvenc::x86_simd::X86_VEXT)4, 4>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
Unexecuted instantiation: void vvenc::roundClip_SSE<(vvenc::x86_simd::X86_VEXT)4, 8>(int*, unsigned int, unsigned int, unsigned int, int, int, int, int)
704
705
template< X86_VEXT vext, int W >
706
void cpyResi_SSE( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height )
707
0
{
708
#if USE_AVX2
709
0
  if( W >= 8 && vext >= AVX2 )
710
0
  {
711
0
    while( height-- )
712
0
    {
713
0
      for( int col = 0; col < width; col += 8 )
714
0
      {
715
0
        __m256i
716
0
        vsrc = _mm256_load_si256        ( ( const __m256i * ) &src[col] );
717
0
        __m128i
718
0
        vdst = _mm_packs_epi32          ( _mm256_castsi256_si128( vsrc ), _mm256_extracti128_si256( vsrc, 1 ) );
719
0
        _mm_storeu_si128                ( ( __m128i * ) &dst[col], vdst );
720
0
      }
721
722
0
      src += width;
723
0
      dst += stride;
724
0
    }
725
0
  }
726
0
  else
727
0
#endif
728
0
  if( W >= 4 )
729
0
  {
730
0
    __m128i vzero = _mm_setzero_si128();
731
0
    __m128i vdst;
732
733
0
    while( height-- )
734
0
    {
735
0
      for( int col = 0; col < width; col += 4 )
736
0
      {
737
0
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
738
0
        vdst = _mm_packs_epi32( vdst, vzero );
739
0
        _vv_storel_epi64      ( ( __m128i * ) &dst[col], vdst );
740
0
      }
741
742
0
      src += width;
743
0
      dst += stride;
744
0
    }
745
0
  }
746
0
  else
747
0
  {
748
0
    THROW( "Unsupported size" );
749
0
  }
750
#if USE_AVX2
751
752
0
  _mm256_zeroupper();
753
0
#endif
754
0
}
Unexecuted instantiation: void vvenc::cpyResi_SSE<(vvenc::x86_simd::X86_VEXT)1, 4>(int const*, short*, long, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::cpyResi_SSE<(vvenc::x86_simd::X86_VEXT)1, 8>(int const*, short*, long, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::cpyResi_SSE<(vvenc::x86_simd::X86_VEXT)4, 4>(int const*, short*, long, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::cpyResi_SSE<(vvenc::x86_simd::X86_VEXT)4, 8>(int const*, short*, long, unsigned int, unsigned int)
755
756
template< X86_VEXT vext, int W >
757
void cpyCoeff_SSE( const Pel* src, ptrdiff_t stride, TCoeff* dst, unsigned width, unsigned height )
758
0
{
759
#if USE_AVX2
760
0
  if( W >= 8 && vext >= AVX2 )
761
0
  {
762
0
    while( height-- )
763
0
    {
764
0
      for( int col = 0; col < width; col += 8 )
765
0
      {
766
0
        __m256i vtmp = _mm256_cvtepi16_epi32( _mm_loadu_si128( ( const __m128i * ) &src[col] ) );
767
0
        _mm256_store_si256( ( __m256i * ) &dst[col], vtmp );
768
0
      }
769
770
0
      src += stride;
771
0
      dst += width;
772
0
    }
773
0
  }
774
0
  else
775
0
#endif
776
0
  if( W >= 4 )
777
0
  {
778
0
    while( height-- )
779
0
    {
780
0
      for( int col = 0; col < width; col += 4 )
781
0
      {
782
0
        __m128i vtmp = _mm_cvtepi16_epi32( _vv_loadl_epi64( ( const __m128i * ) &src[col] ) );
783
0
        _mm_store_si128( ( __m128i * ) &dst[col], vtmp );
784
0
      }
785
786
0
      src += stride;
787
0
      dst += width;
788
0
    }
789
0
  }
790
0
  else
791
0
  {
792
0
    THROW( "Unsupported size" );
793
0
  }
794
#if USE_AVX2
795
796
0
  _mm256_zeroupper();
797
0
#endif
798
0
}
Unexecuted instantiation: void vvenc::cpyCoeff_SSE<(vvenc::x86_simd::X86_VEXT)1, 4>(short const*, long, int*, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::cpyCoeff_SSE<(vvenc::x86_simd::X86_VEXT)1, 8>(short const*, long, int*, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::cpyCoeff_SSE<(vvenc::x86_simd::X86_VEXT)4, 4>(short const*, long, int*, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::cpyCoeff_SSE<(vvenc::x86_simd::X86_VEXT)4, 8>(short const*, long, int*, unsigned int, unsigned int)
799
800
// Inverse LFNST (low-frequency non-separable transform) kernel.
// Computes trSize outputs, each a dot product of the first `zeroOutSize`
// input coefficients with one 8-bit matrix column, then rounds
// (( sum + 64 ) >> 7) and clips to the 16-bit dynamic range.
// - src: input coefficients; only the first zeroOutSize (8 or 16) are read.
// - dst: receives trSize (16 for 4x4, 48 for 8x8) clipped outputs.
// - mode/index select the transform matrix from g_lfnstInv4x4 / g_lfnstInv8x8.
// - size > 4 selects the 8x8 variant (trSize = 48), otherwise 4x4 (trSize = 16).
template<X86_VEXT vext>
void simdInvLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize )
{
  CHECK( index > 2 || ( zeroOutSize != 8 && zeroOutSize != 16 ), "Wrong parameters" );

  static constexpr int maxLog2TrDynamicRange = 15;
  const TCoeff    outputMinimum = -( 1 << maxLog2TrDynamicRange );
  const TCoeff    outputMaximum =  ( 1 << maxLog2TrDynamicRange ) - 1;
  const int8_t*   trMat         = ( size > 4 ) ? g_lfnstInv8x8[mode][index][0] : g_lfnstInv4x4[mode][index][0];
  const int       trSize        = ( size > 4 ) ? 48 : 16;
  int*            out           = dst;

  const __m128i vzero = _mm_setzero_si128();
  const __m128i vmin  = _mm_set1_epi32( outputMinimum );
  const __m128i vmax  = _mm_set1_epi32( outputMaximum );

  // Produce four outputs per iteration; `out` advances with j.
  for( int j = 0; j < trSize; j += 4, out += 4 )
  {
    __m128i       vsum[4];

    // One partial dot product per output lane. Matrix rows are 16 entries
    // apart (trMat += 16) even when only the first zeroOutSize are used.
    for( int k = 0; k < 4; k++, trMat += 16 )
    {
      const int8_t* trMatTmp = trMat;
      int* srcPtr = src;

      __m128i vsrc;
      __m128i vtr;
      __m128i vtmp;
      __m128i vcur = vzero;

      // Process 8 coefficients per step: load 8 int8 matrix entries,
      // sign-extend to int16, then widen each half to int32 for the multiply.
      for( int i = 0; i < zeroOutSize; i += 8, srcPtr += 8, trMatTmp += 8 )
      {
        vsrc = _mm_loadu_si128( ( const __m128i* ) srcPtr );
        vtr  = _vv_loadl_epi64( ( const __m128i* ) trMatTmp );
        vtr  = _mm_cvtepi8_epi16( vtr );
        vtmp = _mm_cvtepi16_epi32( vtr );            // low 4 matrix entries

        vtmp = _mm_mullo_epi32( vsrc, vtmp );
        vcur = _mm_add_epi32( vtmp, vcur );

        vsrc = _mm_loadu_si128( ( const __m128i* ) &srcPtr[4] );
        vtmp = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtr, vzero ) );  // high 4 entries

        vtmp = _mm_mullo_epi32( vsrc, vtmp );
        vcur = _mm_add_epi32( vtmp, vcur );
      }

      vsum[k] = vcur;
    }

    // Two rounds of horizontal adds reduce the four 4-lane partial sums to
    // one vector holding the four scalar dot products.
    __m128i vout = _mm_hadd_epi32( _mm_hadd_epi32( vsum[0], vsum[1] ), _mm_hadd_epi32( vsum[2], vsum[3] ) );
    vout = _mm_add_epi32( vout, _mm_set1_epi32( 64 ) );  // rounding offset for >> 7
    vout = _mm_srai_epi32( vout, 7 );
    vout = _mm_min_epi32( _mm_max_epi32( vmin, vout ), vmax );  // clip to dynamic range

    _mm_storeu_si128( ( __m128i* ) out, vout );
  }
}
Unexecuted instantiation: void vvenc::simdInvLfnstNxN<(vvenc::x86_simd::X86_VEXT)1>(int*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::simdInvLfnstNxN<(vvenc::x86_simd::X86_VEXT)4>(int*, int*, unsigned int, unsigned int, unsigned int, int)
858
859
// Forward LFNST (low-frequency non-separable transform) kernel.
// Each output is the dot product of all trSize input coefficients with one
// 8-bit matrix row, rounded as (( sum + 64 ) >> 7). Only the first
// `zeroOutSize` outputs are computed; the remaining trSize - zeroOutSize
// entries of dst are zeroed at the end.
// - mode/index select the matrix from g_lfnstFwd4x4 / g_lfnstFwd8x8.
// - size > 4 selects the 8x8 variant (trSize = 48), otherwise 4x4 (trSize = 16).
template<X86_VEXT vext>
void simdFwdLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize )
{
  const int8_t *trMat  = ( size > 4 ) ? g_lfnstFwd8x8[mode][index][0] : g_lfnstFwd4x4[mode][index][0];
  const int     trSize = ( size > 4 ) ? 48 : 16;
  int *         out    = dst;

  const __m128i vzero  = _mm_setzero_si128();

  // Produce four outputs per iteration; `out` advances with j.
  for( int j = 0; j < zeroOutSize; j += 4, out += 4 )
  {
    __m128i vout[4];

    // One partial dot product per output lane; trMat advances one matrix
    // row (trSize entries) per k at the bottom of the loop.
    for( int k = 0; k < 4; k++ )
    {
      int* srcPtr = src;
      const int8_t* trMatTmp = trMat;

      __m128i vsum = vzero;

      // Process 16 coefficients per step: load 16 int8 matrix entries and
      // sign-extend them in two stages (int8 -> int16 -> int32), multiplying
      // four int32 coefficients at a time.
      for( int i = 0; i < trSize; i += 16, srcPtr += 16, trMatTmp += 16 )
      {
        __m128i vtrc = _mm_loadu_si128( ( const __m128i* ) trMatTmp );
        __m128i vtrl = _mm_cvtepi8_epi16( vtrc );                              // entries 0..7 as int16
        __m128i vtrh = _mm_cvtepi8_epi16( _mm_unpackhi_epi64( vtrc, vzero ) ); // entries 8..15 as int16

        __m128i vsrc0 = _mm_loadu_si128( ( const __m128i* ) &srcPtr[0] );
                vtrc  = _mm_cvtepi16_epi32( vtrl );                            // entries 0..3
                vsrc0 = _mm_mullo_epi32( vsrc0, vtrc );

        __m128i vsrc1 = _mm_loadu_si128( ( const __m128i* ) &srcPtr[4] );
                vtrc  = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtrl, vzero ) ); // entries 4..7
                vsrc1 = _mm_mullo_epi32( vsrc1, vtrc );

        __m128i vsrc2 = _mm_loadu_si128( ( const __m128i* ) &srcPtr[8] );
                vtrc  = _mm_cvtepi16_epi32( vtrh );                            // entries 8..11
                vsrc2 = _mm_mullo_epi32( vsrc2, vtrc );

        __m128i vsrc3 = _mm_loadu_si128( ( const __m128i* ) &srcPtr[12] );
                vtrc  = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vtrh, vzero ) ); // entries 12..15
                vsrc3 = _mm_mullo_epi32( vsrc3, vtrc );

        // Pairwise tree add of the four products before accumulating.
        vsrc0 = _mm_add_epi32( vsrc0, vsrc1 );
        vsrc2 = _mm_add_epi32( vsrc2, vsrc3 );

        vsum = _mm_add_epi32( vsum, _mm_add_epi32( vsrc0, vsrc2 ) );
      }

      vout[k] = vsum;
      trMat += trSize;
    }

    // Two rounds of horizontal adds reduce the four 4-lane partial sums to
    // one vector holding the four scalar dot products.
    __m128i vdst = _mm_hadd_epi32( _mm_hadd_epi32( vout[0], vout[1] ), _mm_hadd_epi32( vout[2], vout[3] ) );
    vdst = _mm_add_epi32( vdst, _mm_set1_epi32( 64 ) );  // rounding offset for >> 7
    vdst = _mm_srai_epi32( vdst, 7 );

    _mm_storeu_si128( ( __m128i* ) out, vdst );
  }

  // Zero the tail of the output that was skipped by the zero-out logic.
  ::memset( out, 0, ( trSize - zeroOutSize ) * sizeof( int ) );
}
Unexecuted instantiation: void vvenc::simdFwdLfnstNxN<(vvenc::x86_simd::X86_VEXT)1>(int*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::simdFwdLfnstNxN<(vvenc::x86_simd::X86_VEXT)4>(int*, int*, unsigned int, unsigned int, unsigned int, int)
920
921
template<X86_VEXT vext>
922
void TCoeffOps::_initTCoeffOpsX86()
923
0
{
924
0
  cpyResi4     = cpyResi_SSE  <vext, 4>;
925
0
  cpyResi8     = cpyResi_SSE  <vext, 8>;
926
0
  cpyCoeff4    = cpyCoeff_SSE <vext, 4>;
927
0
  cpyCoeff8    = cpyCoeff_SSE <vext, 8>;
928
0
  roundClip4   = roundClip_SSE<vext, 4>;
929
0
  roundClip8   = roundClip_SSE<vext, 8>;
930
931
0
  fastInvCore[0] = fastInv_SSE<vext,  4>;
932
0
  fastInvCore[1] = fastInv_SSE<vext,  8>;
933
0
  fastInvCore[2] = fastInv_SSE<vext, 16>;
934
0
  fastInvCore[3] = fastInv_SSE<vext, 32>;
935
0
  fastInvCore[4] = fastInv_SSE<vext, 64>;
936
937
0
  fastFwdCore_2D[0] = fastFwd_SSE<vext,  4>;
938
0
  fastFwdCore_2D[1] = fastFwd_SSE<vext,  8>;
939
0
  fastFwdCore_2D[2] = fastFwd_SSE<vext, 16>;
940
0
  fastFwdCore_2D[3] = fastFwd_SSE<vext, 32>;
941
0
  fastFwdCore_2D[4] = fastFwd_SSE<vext, 64>;
942
0
}
Unexecuted instantiation: void vvenc::TCoeffOps::_initTCoeffOpsX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::TCoeffOps::_initTCoeffOpsX86<(vvenc::x86_simd::X86_VEXT)4>()
943
944
// Install the x86 SIMD LFNST kernels for the selected vector extension
// level `vext` into the TrQuant function-pointer members.
template<X86_VEXT vext>
void TrQuant::_initTrQuantX86()
{
  m_invLfnstNxN = simdInvLfnstNxN<vext>;
  m_fwdLfnstNxN  = simdFwdLfnstNxN<vext>;
}
Unexecuted instantiation: void vvenc::TrQuant::_initTrQuantX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::TrQuant::_initTrQuantX86<(vvenc::x86_simd::X86_VEXT)4>()
950
951
template void TCoeffOps::_initTCoeffOpsX86<SIMDX86>();
952
template void TrQuant::_initTrQuantX86<SIMDX86>();
953
954
}
955
956
#endif // TARGET_SIMD_X86
957
#endif
958
//! \}