Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvenc/source/Lib/CommonLib/x86/BufferX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
/** \file     YuvX86.cpp
43
    \brief    SIMD averaging.
44
*/
45
46
#pragma once
47
48
#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP 1
49
50
#include "CommonDefX86.h"
51
#include "Unit.h"
52
#include "InterpolationFilter.h"
53
54
#ifdef TARGET_SIMD_X86
55
#if ENABLE_SIMD_OPT_BUFFER
56
57
//! \ingroup CommonLib
58
//! \{
59
60
namespace vvenc {
61
62
#if USE_AVX2
63
template<bool isAligned> static inline __m256i load_aligned_avx2       ( const void* addr );
64
0
template<>                      inline __m256i load_aligned_avx2<true> ( const void* addr ) { return _mm256_load_si256 ( (const __m256i *) addr );}
65
0
template<>                      inline __m256i load_aligned_avx2<false>( const void* addr ) { return _mm256_loadu_si256( (const __m256i *) addr );}
66
#endif
67
68
template<bool isAligned> static inline __m128i load_aligned       ( const void* addr );
69
0
template<>                      inline __m128i load_aligned<true> ( const void* addr ) { return _mm_load_si128 ( (const __m128i *) addr );}
Unexecuted instantiation: Buffer_sse41.cpp:long long __vector(2) vvenc::load_aligned<true>(void const*)
Unexecuted instantiation: Buffer_avx2.cpp:long long __vector(2) vvenc::load_aligned<true>(void const*)
70
0
template<>                      inline __m128i load_aligned<false>( const void* addr ) { return _mm_loadu_si128( (const __m128i *) addr );}
Unexecuted instantiation: Buffer_sse41.cpp:long long __vector(2) vvenc::load_aligned<false>(void const*)
Unexecuted instantiation: Buffer_avx2.cpp:long long __vector(2) vvenc::load_aligned<false>(void const*)
71
72
template< X86_VEXT vext >
73
void weightCiip_SSE( Pel* res, const Pel* src, const int numSamples, int numIntra )
74
0
{
75
#if USE_AVX2
76
  int n = 16;
77
0
  if( numIntra == 1 )
78
0
  {
79
0
    __m256i vres;
80
0
    __m256i vpred = _mm256_load_si256((const __m256i*)&res[0]);
81
0
    __m256i vsrc  = _mm256_load_si256((const __m256i*)&src[0]);
82
0
    for( ; n < numSamples; n+=16)
83
0
    {
84
0
      vres = _mm256_avg_epu16( vpred, vsrc );
85
0
      vpred = _mm256_load_si256((const __m256i*)&res[n]);
86
0
      vsrc  = _mm256_load_si256((const __m256i*)&src[n]);
87
0
      _mm256_storeu_si256( ( __m256i * )&res[n-16], vres );
88
0
    }
89
0
    vres = _mm256_avg_epu16( vpred, vsrc );
90
0
    _mm256_storeu_si256( ( __m256i * )&res[n-16], vres );
91
0
  }
92
0
  else
93
0
  {
94
0
    const Pel* scale   = ( numIntra == 0 ) ? res : src;
95
0
    const Pel* unscale = ( numIntra == 0 ) ? src : res;
96
97
0
    __m256i vres;
98
0
    __m256i voffset = _mm256_set1_epi16(2);
99
0
    __m256i vscl = _mm256_load_si256((const __m256i*)&scale[0]);
100
0
    __m256i vuns = _mm256_load_si256((const __m256i*)&unscale[0]);
101
0
    for( ; n < numSamples; n+=16)
102
0
    {
103
0
      vres = _mm256_srai_epi16( _mm256_adds_epi16( _mm256_adds_epi16(_mm256_adds_epi16( vscl, vscl),_mm256_adds_epi16( vscl, vuns)), voffset), 2 );
104
0
      vscl = _mm256_load_si256((const __m256i*)&scale[n]);
105
0
      vuns = _mm256_load_si256((const __m256i*)&unscale[n]);
106
0
      _mm256_storeu_si256( ( __m256i * )&res[n-16], vres );
107
0
    }
108
0
    vres = _mm256_srai_epi16( _mm256_adds_epi16( _mm256_adds_epi16(_mm256_adds_epi16( vscl, vscl),_mm256_adds_epi16( vscl, vuns)), voffset), 2 );
109
0
    _mm256_storeu_si256( ( __m256i * )&res[n-16], vres );
110
0
  }
111
#else
112
  int n = 8;
113
0
  if( numIntra == 1 )
114
0
  {
115
0
    __m128i vres;
116
0
    __m128i vpred = _mm_load_si128((const __m128i*)&res[0]);
117
0
    __m128i vsrc  = _mm_load_si128((const __m128i*)&src[0]);
118
0
    for( ; n < numSamples; n+=8)
119
0
    {
120
0
      vres = _mm_avg_epu16( vpred, vsrc );
121
0
      vpred = _mm_load_si128((const __m128i*)&res[n]);
122
0
      vsrc  = _mm_load_si128((const __m128i*)&src[n]);
123
0
      _mm_storeu_si128( ( __m128i * )&res[n-8], vres );
124
0
    }
125
0
    vres = _mm_avg_epu16( vpred, vsrc );
126
0
    _mm_storeu_si128( ( __m128i * )&res[n-8], vres );
127
0
  }
128
0
  else
129
0
  {
130
0
    const Pel* scale   = ( numIntra == 0 ) ? res : src;
131
0
    const Pel* unscale = ( numIntra == 0 ) ? src : res;
132
133
0
    __m128i vres;
134
0
    __m128i voffset = _mm_set1_epi16(2);
135
0
    __m128i vscl = _mm_load_si128((const __m128i*)&scale[0]);
136
0
    __m128i vuns = _mm_load_si128((const __m128i*)&unscale[0]);
137
0
    for( ; n < numSamples; n+=8)
138
0
    {
139
0
      vres = _mm_srai_epi16( _mm_adds_epi16( _mm_adds_epi16(_mm_adds_epi16( vscl, vscl),_mm_adds_epi16( vscl, vuns)), voffset), 2 );
140
0
      vscl = _mm_load_si128((const __m128i*)&scale[n]);
141
0
      vuns = _mm_load_si128((const __m128i*)&unscale[n]);
142
0
      _mm_storeu_si128( ( __m128i * )&res[n-8], vres );
143
0
    }
144
0
    vres = _mm_srai_epi16( _mm_adds_epi16( _mm_adds_epi16(_mm_adds_epi16( vscl, vscl),_mm_adds_epi16( vscl, vuns)), voffset), 2 );
145
0
    _mm_storeu_si128( ( __m128i * )&res[n-8], vres );
146
0
  }
147
#endif
148
0
}
Unexecuted instantiation: void vvenc::weightCiip_SSE<(vvenc::x86_simd::X86_VEXT)1>(short*, short const*, int, int)
Unexecuted instantiation: void vvenc::weightCiip_SSE<(vvenc::x86_simd::X86_VEXT)4>(short*, short const*, int, int)
149
150
template< X86_VEXT vext, unsigned inputSize, unsigned outputSize >
151
void mipMatrixMul_SSE( Pel* res, const Pel* input, const uint8_t* weight, const int maxVal, const int inputOffset, bool transpose )
152
0
{
153
0
  int sum = 0;
154
0
  for( int i = 0; i < inputSize; i++ ) { sum += input[i]; }
155
0
  const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum + (inputOffset << MIP_SHIFT_MATRIX);
156
0
  CHECK( inputSize != 4 * (inputSize >> 2), "Error, input size not divisible by four" );
157
158
#if USE_AVX2
159
#if !ENABLE_VALGRIND_CODE
160
  static
161
#endif
162
  const __m256i perm = _mm256_setr_epi32(0,4,1,5,2,6,3,7);
163
  __m256i vibdimin  = _mm256_set1_epi16( 0 );
164
  __m256i vibdimax  = _mm256_set1_epi16( maxVal );
165
0
  if( inputSize == 4 && outputSize == 4)
166
0
  {
167
0
    __m256i voffset   = _mm256_set1_epi32( offset );
168
0
    __m256i vin = _mm256_set1_epi64x( *((const int64_t*)input) );
169
0
    __m256i vw = _mm256_load_si256((const __m256i*)(weight));
170
171
    __m256i w0 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( vw ) );   //w0 - w16
172
    __m256i w1 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( vw, 1 ) ); //w16 -w 32
173
     
174
    __m256i r0 = _mm256_madd_epi16( vin, w0 );
175
    __m256i r1 = _mm256_madd_epi16( vin, w1 );
176
    __m256i r2 = _mm256_hadd_epi32( r0 , r1);
177
           
178
            r2 = _mm256_add_epi32( r2, voffset );
179
    __m256i r3 = _mm256_srai_epi32( r2, MIP_SHIFT_MATRIX );
180
181
            vw = _mm256_load_si256((const __m256i*)(weight+32));
182
            w0 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( vw ) );   //w0 - w16
183
            w1 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( vw, 1 ) ); //w16 -w 32
184
     
185
            r0 = _mm256_madd_epi16( vin, w0 );
186
            r1 = _mm256_madd_epi16( vin, w1 );
187
            r2 = _mm256_hadd_epi32( r0 , r1);
188
189
            r2 = _mm256_add_epi32( r2, voffset );
190
            r2 = _mm256_srai_epi32( r2, MIP_SHIFT_MATRIX );
191
            r2 = _mm256_packs_epi32( r3, r2 );
192
            r2 = _mm256_permutevar8x32_epi32 ( r2, perm );
193
194
            r2 = _mm256_min_epi16( vibdimax, _mm256_max_epi16( vibdimin, r2 ) );
195
196
0
      if( transpose )
197
0
      {
198
0
        __m256i vshuf0 = _mm256_set_epi8( 0xf, 0xe, 0xb, 0xa, 0x7, 0x6, 0x3, 0x2, 0xd, 0xc, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0,
199
0
                                      0xf, 0xe, 0xb, 0xa, 0x7, 0x6, 0x3, 0x2, 0xd, 0xc, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0);
200
0
         r2 = _mm256_permutevar8x32_epi32( r2, _mm256_set_epi32(7,5,3,1,6,4,2,0) );
201
0
         r2 = _mm256_shuffle_epi8 ( r2, vshuf0);
202
0
      }
203
204
      _mm256_store_si256( ( __m256i * )&res[0], r2 );
205
  }
206
0
  else if( inputSize == 8 )
207
0
  {
208
0
    __m256i voffset   = _mm256_set1_epi32( offset );
209
0
    __m128i inv =_mm_load_si128( ( __m128i* )input );
210
0
    __m256i vin = _mm256_permute2f128_si256(_mm256_castsi128_si256(inv), _mm256_castsi128_si256(inv), 2); 
211
0
    __m256i r2;
212
0
    for( int i = 0; i < outputSize*outputSize; i+=16)
213
0
    {
214
0
      __m256i vw = _mm256_load_si256((const __m256i*)(weight));
215
216
0
      __m256i w0 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( vw ) );   //w0 - w16
217
0
      __m256i w1 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( vw, 1 ) ); //w16 -w 32
218
     
219
0
      __m256i r0 = _mm256_madd_epi16( vin, w0 );
220
0
      __m256i r1 = _mm256_madd_epi16( vin, w1 );
221
0
              r2 = _mm256_hadd_epi32( r0 , r1);
222
           
223
0
              vw = _mm256_load_si256((const __m256i*)(weight+32));
224
0
              w0 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( vw ) );   //w0 - w16
225
0
              w1 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( vw, 1 ) ); //w16 -w 32
226
     
227
0
              r0 = _mm256_madd_epi16( vin, w0 );
228
0
              r1 = _mm256_madd_epi16( vin, w1 );
229
0
      __m256i r4 = _mm256_hadd_epi32( r0 , r1);
230
231
0
              r2 = _mm256_hadd_epi32( r2 , r4);
232
233
0
              r2 = _mm256_add_epi32( r2, voffset );
234
0
      __m256i r3 = _mm256_srai_epi32( r2, MIP_SHIFT_MATRIX );
235
236
0
              vw = _mm256_load_si256((const __m256i*)(weight+64));
237
238
0
              w0 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( vw ) );   //w0 - w16
239
0
              w1 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( vw, 1 ) ); //w16 -w 32
240
     
241
0
              r0 = _mm256_madd_epi16( vin, w0 );
242
0
              r1 = _mm256_madd_epi16( vin, w1 );
243
0
              r2 = _mm256_hadd_epi32( r0 , r1);
244
           
245
0
              vw = _mm256_load_si256((const __m256i*)(weight+96));
246
0
              w0 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( vw ) );   //w0 - w16
247
0
              w1 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( vw, 1 ) ); //w16 -w 32
248
     
249
0
              r0 = _mm256_madd_epi16( vin, w0 );
250
0
              r1 = _mm256_madd_epi16( vin, w1 );
251
252
0
              r2 = _mm256_hadd_epi32( r2 , _mm256_hadd_epi32( r0 , r1));
253
254
0
              r2 = _mm256_add_epi32( r2, voffset );
255
0
              r2 = _mm256_srai_epi32( r2, MIP_SHIFT_MATRIX );
256
257
0
              r2 = _mm256_permutevar8x32_epi32 ( r2, perm );
258
0
              r3 = _mm256_permutevar8x32_epi32 ( r3, perm );
259
260
0
              r3 = _mm256_packs_epi32( r3, r2 );
261
0
              r2 = _mm256_permute4x64_epi64( r3, 0xd8 );
262
263
0
              r2 = _mm256_min_epi16( vibdimax, _mm256_max_epi16( vibdimin, r2 ) );
264
265
0
        _mm256_store_si256( ( __m256i * )&res[0], r2 );
266
0
        res+=16;
267
0
        weight+=128;
268
0
    }
269
270
0
    if( transpose )
271
0
    {
272
0
      if( outputSize == 4 )
273
0
      {
274
0
        __m256i vshuf0 = _mm256_set_epi8( 0xf, 0xe, 0xb, 0xa, 0x7, 0x6, 0x3, 0x2, 0xd, 0xc, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0,
275
0
                                      0xf, 0xe, 0xb, 0xa, 0x7, 0x6, 0x3, 0x2, 0xd, 0xc, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0);
276
0
         r2 = _mm256_permutevar8x32_epi32( r2, _mm256_set_epi32(7,5,3,1,6,4,2,0) );
277
0
         r2 = _mm256_shuffle_epi8 ( r2, vshuf0);
278
0
        _mm256_store_si256( ( __m256i * )(res-16), r2 );
279
0
      }
280
0
      else
281
0
      {
282
0
        res -= 64;
283
284
0
        __m256i va, vb, vc, vd, wa, wb, wc, wd;
285
286
0
        va = _mm256_load_si256( ( const __m256i* ) res ); 
287
0
        vb = _mm256_load_si256( ( const __m256i* ) (res+16) ); 
288
0
        vc = _mm256_load_si256( ( const __m256i* ) (res+32) ); 
289
290
291
0
        va =_mm256_permute4x64_epi64(va, 0xd8); 
292
0
        vb =_mm256_permute4x64_epi64(vb, 0xd8);
293
0
        vc =_mm256_permute4x64_epi64(vc, 0xd8);
294
0
        vd =_mm256_permute4x64_epi64(r2, 0xd8);
295
296
0
        wa = _mm256_unpacklo_epi16( va, vb );
297
0
        wb = _mm256_unpackhi_epi16( va, vb );
298
0
        wc = _mm256_unpacklo_epi16( vc, vd );
299
0
        wd = _mm256_unpackhi_epi16( vc, vd );
300
301
0
        va = _mm256_unpacklo_epi16( wa, wb );
302
0
        vb = _mm256_unpackhi_epi16( wa, wb );
303
0
        vc = _mm256_unpacklo_epi16( wc, wd );
304
0
        vd = _mm256_unpackhi_epi16( wc, wd );
305
306
0
        va =_mm256_permute4x64_epi64(va, 0xd8); 
307
0
        vb =_mm256_permute4x64_epi64(vb, 0xd8);
308
0
        vc =_mm256_permute4x64_epi64(vc, 0xd8);
309
0
        vd =_mm256_permute4x64_epi64(vd, 0xd8);
310
311
0
        wa = _mm256_unpacklo_epi64( va, vc );
312
0
        wb = _mm256_unpacklo_epi64( vb, vd );
313
0
        wc = _mm256_unpackhi_epi64( va, vc );
314
0
        wd = _mm256_unpackhi_epi64( vb, vd );
315
316
0
        _mm256_store_si256( ( __m256i* ) res, wa ); 
317
0
        _mm256_store_si256( ( __m256i* ) (res+16), wb );
318
0
        _mm256_store_si256( ( __m256i* ) (res+32), wc );
319
0
        _mm256_store_si256( ( __m256i* ) (res+48), wd );
320
0
      }
321
0
    }
322
0
  }
323
#else
324
  __m128i zero  = _mm_set1_epi16( 0 );
325
  __m128i vibdimax  = _mm_set1_epi16( maxVal );
326
0
  if( inputSize == 4 && outputSize == 4)
327
0
  {
328
0
    __m128i vin = _mm_set1_epi64x( *((const int64_t*)input) );
329
0
    __m128i voffset = _mm_set1_epi32( offset );
330
0
    __m128i r2 = vin;
331
0
    __m128i r;
332
0
    for( int i = 0; i < 2; i++)
333
0
    {
334
0
             r = r2; // save the result from the first interation
335
0
    __m128i vw = _mm_load_si128((const __m128i*)weight);
336
337
    __m128i w0 = _mm_unpacklo_epi8( vw, zero );
338
    __m128i w1 = _mm_unpackhi_epi8( vw, zero );
339
     
340
    __m128i r0 = _mm_madd_epi16( vin, w0 );
341
    __m128i r1 = _mm_madd_epi16( vin, w1 );
342
            r2 = _mm_hadd_epi32( r0 , r1);
343
           
344
            r2 = _mm_add_epi32( r2, voffset );
345
    __m128i r3 = _mm_srai_epi32( r2, MIP_SHIFT_MATRIX );
346
347
            vw = _mm_load_si128((const __m128i*)(weight+16));
348
            w0 = _mm_unpacklo_epi8( vw, zero );
349
            w1 = _mm_unpackhi_epi8( vw, zero );
350
     
351
            r0 = _mm_madd_epi16( vin, w0 );
352
            r1 = _mm_madd_epi16( vin, w1 );
353
            r2 = _mm_hadd_epi32( r0 , r1);
354
355
            r2 = _mm_add_epi32( r2, voffset );
356
            r2 = _mm_srai_epi32( r2, MIP_SHIFT_MATRIX );
357
            r2 = _mm_packs_epi32( r3, r2 );
358
359
            r2 = _mm_min_epi16( vibdimax, _mm_max_epi16( zero, r2 ) );
360
361
      _mm_store_si128( ( __m128i * )&res[0], r2 );
362
      res +=8;
363
      weight += 32;
364
    }
365
366
0
    if( transpose)
367
0
    {
368
0
      __m128i vc, vd, va, vb;
369
0
      vc = _mm_unpacklo_epi16( r, r2 );
370
0
      vd = _mm_unpackhi_epi16( r, r2 );
371
 
372
0
      va = _mm_unpacklo_epi16( vc, vd );
373
0
      vb = _mm_unpackhi_epi16( vc, vd );
374
 
375
0
      _mm_store_si128( ( __m128i* ) (res-16), va ); 
376
0
      _mm_store_si128( ( __m128i* ) (res-8), vb ); 
377
0
    }
378
379
  }
380
0
  else
381
0
  {
382
0
    __m128i vin = _mm_load_si128( (const __m128i*)input);
383
0
    __m128i voffset = _mm_set1_epi32( offset );
384
385
0
    for( int i = 0; i < outputSize*outputSize; i+=4)
386
0
    {
387
0
    __m128i vw = _mm_load_si128((const __m128i*)(weight));
388
389
0
    __m128i w0 = _mm_unpacklo_epi8( vw, zero );
390
0
    __m128i w1 = _mm_unpackhi_epi8( vw, zero );
391
     
392
0
    __m128i r0 = _mm_madd_epi16( vin, w0 );
393
0
    __m128i r1 = _mm_madd_epi16( vin, w1 );
394
0
    __m128i r2 = _mm_hadd_epi32( r0 , r1);
395
           
396
0
            vw = _mm_load_si128((const __m128i*)(weight+16));
397
0
            w0 = _mm_unpacklo_epi8( vw, zero );
398
0
            w1 = _mm_unpackhi_epi8( vw, zero );
399
     
400
0
            r0 = _mm_madd_epi16( vin, w0 );
401
0
            r1 = _mm_madd_epi16( vin, w1 );
402
403
0
            r2 = _mm_hadd_epi32( r2 , _mm_hadd_epi32( r0 , r1));
404
405
0
            r2 = _mm_add_epi32( r2, voffset );
406
0
            r2 = _mm_srai_epi32( r2, MIP_SHIFT_MATRIX );
407
408
0
            r2 = _mm_packs_epi32( r2, r2 );
409
410
0
            r2 = _mm_min_epi16( vibdimax, _mm_max_epi16( zero, r2 ) );
411
412
0
      _vv_storel_epi64( ( __m128i * )&res[0], r2 );
413
0
      res +=4;
414
0
      weight += 32;
415
0
    }
416
417
0
    if( transpose )
418
0
    {
419
0
      if( outputSize == 4)
420
0
      {
421
0
        res -= 16;
422
0
        __m128i vc, vd, va, vb;
423
0
        va = _mm_load_si128( ( const __m128i* ) (res) );
424
0
        vb = _mm_load_si128( ( const __m128i* ) (res+8) );
425
426
0
        vc = _mm_unpacklo_epi16( va, vb );
427
0
        vd = _mm_unpackhi_epi16( va, vb );
428
 
429
0
        va = _mm_unpacklo_epi16( vc, vd );
430
0
        vb = _mm_unpackhi_epi16( vc, vd );
431
 
432
0
        _mm_store_si128( ( __m128i* ) (res), va ); 
433
0
        _mm_store_si128( ( __m128i* ) (res+8), vb ); 
434
0
      }
435
0
      else
436
0
      {
437
0
        res -= 64;
438
0
        __m128i va, vb, vc, vd, ve, vf, vg, vh;
439
440
0
        va = _mm_load_si128( ( const __m128i* ) (res) );
441
0
        vb = _mm_load_si128( ( const __m128i* ) (res+8) );
442
0
        vc = _mm_load_si128( ( const __m128i* ) (res+16) );
443
0
        vd = _mm_load_si128( ( const __m128i* ) (res+24) );
444
0
        ve = _mm_load_si128( ( const __m128i* ) (res+32) );
445
0
        vf = _mm_load_si128( ( const __m128i* ) (res+40) );
446
0
        vg = _mm_load_si128( ( const __m128i* ) (res+48) );
447
0
        vh = _mm_load_si128( ( const __m128i* ) (res+56) );
448
449
0
        __m128i va01b01 = _mm_unpacklo_epi16( va, vb );
450
0
        __m128i va23b23 = _mm_unpackhi_epi16( va, vb );
451
0
        __m128i vc01d01 = _mm_unpacklo_epi16( vc, vd );
452
0
        __m128i vc23d23 = _mm_unpackhi_epi16( vc, vd );
453
0
        __m128i ve01f01 = _mm_unpacklo_epi16( ve, vf );
454
0
        __m128i ve23f23 = _mm_unpackhi_epi16( ve, vf );
455
0
        __m128i vg01h01 = _mm_unpacklo_epi16( vg, vh );
456
0
        __m128i vg23h23 = _mm_unpackhi_epi16( vg, vh );
457
458
0
        va = _mm_unpacklo_epi32( va01b01, vc01d01 );
459
0
        vb = _mm_unpackhi_epi32( va01b01, vc01d01 );
460
0
        vc = _mm_unpacklo_epi32( va23b23, vc23d23 );
461
0
        vd = _mm_unpackhi_epi32( va23b23, vc23d23 );
462
0
        ve = _mm_unpacklo_epi32( ve01f01, vg01h01 );
463
0
        vf = _mm_unpackhi_epi32( ve01f01, vg01h01 );
464
0
        vg = _mm_unpacklo_epi32( ve23f23, vg23h23 );
465
0
        vh = _mm_unpackhi_epi32( ve23f23, vg23h23 );
466
467
0
        va01b01 = _mm_unpacklo_epi64( va, ve );
468
0
        va23b23 = _mm_unpackhi_epi64( va, ve );
469
0
        vc01d01 = _mm_unpacklo_epi64( vb, vf );
470
0
        vc23d23 = _mm_unpackhi_epi64( vb, vf );
471
0
        ve01f01 = _mm_unpacklo_epi64( vc, vg );
472
0
        ve23f23 = _mm_unpackhi_epi64( vc, vg );
473
0
        vg01h01 = _mm_unpacklo_epi64( vd, vh );
474
0
        vg23h23 = _mm_unpackhi_epi64( vd, vh );
475
476
0
        _mm_store_si128( ( __m128i* ) (res),    va01b01 );
477
0
        _mm_store_si128( ( __m128i* ) (res+8) , va23b23 );
478
0
        _mm_store_si128( ( __m128i* ) (res+16), vc01d01 );
479
0
        _mm_store_si128( ( __m128i* ) (res+24), vc23d23 );
480
0
        _mm_store_si128( ( __m128i* ) (res+32), ve01f01 );
481
0
        _mm_store_si128( ( __m128i* ) (res+40), ve23f23 );
482
0
        _mm_store_si128( ( __m128i* ) (res+48), vg01h01 );
483
0
        _mm_store_si128( ( __m128i* ) (res+56), vg23h23 );
484
0
      }
485
0
    }
486
0
  }
487
#endif
488
0
}
Unexecuted instantiation: void vvenc::mipMatrixMul_SSE<(vvenc::x86_simd::X86_VEXT)1, 4u, 4u>(short*, short const*, unsigned char const*, int, int, bool)
Unexecuted instantiation: void vvenc::mipMatrixMul_SSE<(vvenc::x86_simd::X86_VEXT)1, 8u, 4u>(short*, short const*, unsigned char const*, int, int, bool)
Unexecuted instantiation: void vvenc::mipMatrixMul_SSE<(vvenc::x86_simd::X86_VEXT)1, 8u, 8u>(short*, short const*, unsigned char const*, int, int, bool)
Unexecuted instantiation: void vvenc::mipMatrixMul_SSE<(vvenc::x86_simd::X86_VEXT)4, 4u, 4u>(short*, short const*, unsigned char const*, int, int, bool)
Unexecuted instantiation: void vvenc::mipMatrixMul_SSE<(vvenc::x86_simd::X86_VEXT)4, 8u, 4u>(short*, short const*, unsigned char const*, int, int, bool)
Unexecuted instantiation: void vvenc::mipMatrixMul_SSE<(vvenc::x86_simd::X86_VEXT)4, 8u, 8u>(short*, short const*, unsigned char const*, int, int, bool)
489
490
491
template< X86_VEXT vext>
492
void addAvg_SSE( const Pel* src0, const Pel* src1, Pel* dst, int numSamples, unsigned shift, int offset, const ClpRng& clpRng )
493
0
{
494
#if USE_AVX2
495
0
  if( numSamples >= 16 )
496
0
  {
497
0
    const __m256i voffset   = _mm256_set1_epi32( offset );
498
0
    const __m256i vibdimin  = _mm256_set1_epi16( clpRng.min() );
499
0
    const __m256i vibdimax  = _mm256_set1_epi16( clpRng.max() );
500
0
    const __m256i vone      = _mm256_set1_epi16( 1 );
501
502
0
    for( int col = 0; col < numSamples; col += 16 )
503
0
    {
504
0
      __m256i vsrc0 = _mm256_load_si256( ( const __m256i* )&src0[col] );
505
0
      __m256i vsrc1 = _mm256_load_si256( ( const __m256i* )&src1[col] );
506
507
0
      __m256i vsum, vdst;
508
0
      vsum = _mm256_unpacklo_epi16    ( vsrc0, vsrc1 );
509
0
      vsum = _mm256_madd_epi16        ( vsum, vone );
510
0
      vsum = _mm256_add_epi32         ( vsum, voffset );
511
0
      vdst = _mm256_srai_epi32        ( vsum, shift );
512
      
513
0
      vsum = _mm256_unpackhi_epi16    ( vsrc0, vsrc1 );
514
0
      vsum = _mm256_madd_epi16        ( vsum, vone );
515
0
      vsum = _mm256_add_epi32         ( vsum, voffset );
516
0
      vsum = _mm256_srai_epi32        ( vsum, shift );
517
518
0
      vdst = _mm256_packs_epi32       ( vdst, vsum );
519
520
0
      vdst = _mm256_min_epi16( vibdimax, _mm256_max_epi16( vibdimin, vdst ) );
521
0
      _mm256_store_si256( ( __m256i * )&dst[col], vdst );
522
0
    }
523
0
  }
524
0
  else
525
0
#endif
526
0
  if( numSamples >= 8 )
527
0
  {
528
0
    const __m128i vone     = _mm_set1_epi16( 1 );
529
0
    const __m128i voffset  = _mm_set1_epi32( offset );
530
0
    const __m128i vibdimin = _mm_set1_epi16( clpRng.min() );
531
0
    const __m128i vibdimax = _mm_set1_epi16( clpRng.max() );
532
533
0
    for( int col = 0; col < numSamples; col += 8 )
534
0
    {
535
0
      __m128i vsrc0 = _mm_load_si128 ( (const __m128i *)&src0[col] );
536
0
      __m128i vsrc1 = _mm_load_si128 ( (const __m128i *)&src1[col] );
537
538
0
      __m128i vsum, vdst;
539
0
      vsum = _mm_unpacklo_epi16    ( vsrc0, vsrc1 );
540
0
      vsum = _mm_madd_epi16        ( vsum, vone );
541
0
      vsum = _mm_add_epi32         ( vsum, voffset );
542
0
      vdst = _mm_srai_epi32        ( vsum, shift );
543
      
544
0
      vsum = _mm_unpackhi_epi16    ( vsrc0, vsrc1 );
545
0
      vsum = _mm_madd_epi16        ( vsum, vone );
546
0
      vsum = _mm_add_epi32         ( vsum, voffset );
547
0
      vsum = _mm_srai_epi32        ( vsum, shift );
548
549
0
      vdst = _mm_packs_epi32       ( vdst, vsum );
550
551
0
      vdst = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vdst ) );
552
0
      _mm_store_si128( ( __m128i * )&dst[col], vdst );
553
0
    }
554
0
  }
555
0
  else if( numSamples == 4 )
556
0
  {
557
0
    const __m128i vone      = _mm_set1_epi16( 1 );
558
0
    const __m128i vzero     = _mm_setzero_si128();
559
0
    const __m128i voffset   = _mm_set1_epi32( offset );
560
0
    const __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
561
0
    const __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
562
563
0
    __m128i vsum = _vv_loadl_epi64  ( ( const __m128i * )&src0[0] );
564
0
    __m128i vdst = _vv_loadl_epi64  ( ( const __m128i * )&src1[0] );
565
0
    vsum = _mm_unpacklo_epi16    ( vsum, vdst );
566
0
    vsum = _mm_madd_epi16        ( vsum, vone );
567
0
    vsum = _mm_add_epi32         ( vsum, voffset );
568
0
    vsum = _mm_srai_epi32        ( vsum, shift );
569
0
    vdst = _mm_packs_epi32       ( vsum, vzero );
570
571
0
    vdst = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vdst ) );
572
0
    _vv_storel_epi64( ( __m128i * )&dst[0], vdst );
573
0
  }
574
0
  else
575
0
  {
576
0
    THROW( "Unsupported size" );
577
0
  }
578
#if USE_AVX2
579
580
0
  _mm256_zeroupper();
581
0
#endif
582
0
}
Unexecuted instantiation: void vvenc::addAvg_SSE<(vvenc::x86_simd::X86_VEXT)1>(short const*, short const*, short*, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addAvg_SSE<(vvenc::x86_simd::X86_VEXT)4>(short const*, short const*, short*, int, unsigned int, int, vvenc::ClpRng const&)
583
584
template< X86_VEXT vext>
585
void roundGeo_SSE( const Pel* src, Pel* dst, const int numSamples, unsigned shift, int offset, const ClpRng &clpRng)
586
0
{
587
#if USE_AVX2
588
0
  if( numSamples >= 16 )
589
0
  {
590
0
    __m256i voffset   = _mm256_set1_epi16( offset );
591
0
    __m256i vibdimin  = _mm256_set1_epi16( clpRng.min() );
592
0
    __m256i vibdimax  = _mm256_set1_epi16( clpRng.max() );
593
594
0
    for( int col = 0; col < numSamples; col += 16 )
595
0
    {
596
0
      __m256i val = _mm256_load_si256( ( const __m256i* )&src[col] );
597
0
      val = _mm256_adds_epi16        ( val, voffset );
598
0
      val = _mm256_srai_epi16        ( val, shift );
599
0
      val = _mm256_min_epi16( vibdimax, _mm256_max_epi16( vibdimin, val ) );
600
0
      _mm256_store_si256( ( __m256i * )&dst[col], val );
601
0
    }
602
0
  }
603
0
  else
604
0
#endif
605
0
  {
606
0
    __m128i voffset   = _mm_set1_epi16( offset );
607
0
    __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
608
0
    __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
609
610
0
    if( numSamples >= 8 )
611
0
    {
612
0
      for( int col = 0; col < numSamples; col += 8 )
613
0
      {
614
0
        __m128i val = _mm_load_si128 ( (const __m128i *)&src[col] );
615
0
        val  = _mm_adds_epi16        ( val, voffset );
616
0
        val  = _mm_srai_epi16        ( val, shift );
617
0
        val  = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, val ) );
618
0
        _mm_store_si128( ( __m128i * )&dst[col], val );
619
0
      }
620
0
    }
621
0
    else //if( numSamples == 4 )
622
0
    {
623
0
      __m128i val = _vv_loadl_epi64  ( ( const __m128i * )&src[0] );
624
0
      val = _mm_adds_epi16           ( val, voffset );
625
0
      val = _mm_srai_epi16           ( val, shift );
626
0
      val = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, val ) );
627
0
      _vv_storel_epi64( ( __m128i * )&dst[0], val );
628
0
    }
629
0
  }
630
#if USE_AVX2
631
632
  _mm256_zeroupper();
633
#endif
634
0
}
Unexecuted instantiation: void vvenc::roundGeo_SSE<(vvenc::x86_simd::X86_VEXT)1>(short const*, short*, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::roundGeo_SSE<(vvenc::x86_simd::X86_VEXT)4>(short const*, short*, int, unsigned int, int, vvenc::ClpRng const&)
635
636
template< X86_VEXT vext >
637
void recoCore_SSE( const Pel* src0, const Pel* src1, Pel* dst, int numSamples, const ClpRng& clpRng )
638
0
{
639
#if USE_AVX2
640
0
  if( vext >= AVX2 && numSamples >= 16 )
641
0
  {
642
0
    __m256i vbdmin = _mm256_set1_epi16( clpRng.min() );
643
0
    __m256i vbdmax = _mm256_set1_epi16( clpRng.max() );
644
645
0
    for( int n = 0; n < numSamples; n += 16 )
646
0
    {
647
0
      __m256i vdest = _mm256_load_si256 ( ( const __m256i * )&src0[n] );
648
0
      __m256i vsrc1 = _mm256_load_si256( ( const __m256i * )&src1[n] );
649
650
0
      vdest = _mm256_adds_epi16( vdest, vsrc1 );
651
0
      vdest = _mm256_min_epi16 ( vbdmax, _mm256_max_epi16( vbdmin, vdest ) );
652
653
0
      _mm256_store_si256( ( __m256i * )&dst[n], vdest );
654
0
    }
655
0
  }
656
0
  else
657
0
#endif
658
0
  if( numSamples >= 8 )
659
0
  {
660
0
    __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
661
0
    __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
662
663
0
    for( int n = 0; n < numSamples; n += 8 )
664
0
    {
665
0
      __m128i vdest = _mm_load_si128 ( ( const __m128i * )&src0[n] );
666
0
      __m128i vsrc1 = _mm_load_si128( ( const __m128i * )&src1[n] );
667
668
0
      vdest = _mm_adds_epi16( vdest, vsrc1 );
669
0
      vdest = _mm_min_epi16 ( vbdmax, _mm_max_epi16( vbdmin, vdest ) );
670
671
0
      _mm_store_si128( ( __m128i * )&dst[n], vdest );
672
0
    }
673
0
  }
674
0
  else
675
0
  {
676
0
    __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
677
0
    __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
678
679
0
    __m128i vsrc = _vv_loadl_epi64( ( const __m128i * )&src0[0] );
680
0
    __m128i vdst = _vv_loadl_epi64( ( const __m128i * )&src1[0] );
681
682
0
    vdst = _mm_adds_epi16( vdst, vsrc );
683
0
    vdst = _mm_min_epi16 ( vbdmax, _mm_max_epi16( vbdmin, vdst ) );
684
685
0
    _vv_storel_epi64( ( __m128i * )&dst[0], vdst );
686
0
  }
687
#if USE_AVX2
688
689
  _mm256_zeroupper();
690
#endif
691
0
}
Unexecuted instantiation: void vvenc::recoCore_SSE<(vvenc::x86_simd::X86_VEXT)1>(short const*, short const*, short*, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::recoCore_SSE<(vvenc::x86_simd::X86_VEXT)4>(short const*, short const*, short*, int, vvenc::ClpRng const&)
692
693
template<X86_VEXT vext>
694
void copyClip_SSE( const Pel* src, Pel* dst, int numSamples, const ClpRng& clpRng )
695
0
{
696
0
  if( vext >= AVX2 && numSamples >= 16 )
697
0
  {
698
#if USE_AVX2
699
    __m256i vbdmin   = _mm256_set1_epi16( clpRng.min() );
700
    __m256i vbdmax   = _mm256_set1_epi16( clpRng.max() );
701
702
0
    for( int col = 0; col < numSamples; col += 16 )
703
0
    {
704
0
      __m256i val = _mm256_loadu_si256  ( ( const __m256i * ) &src[col] );
705
0
      val = _mm256_min_epi16            ( vbdmax, _mm256_max_epi16( vbdmin, val ) );
706
0
      _mm256_storeu_si256               ( ( __m256i * )&dst[col], val );
707
0
    }
708
#endif
709
0
  }
710
0
  else if(numSamples >= 8 )
711
0
  {
712
0
    __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
713
0
    __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
714
715
0
    for( int col = 0; col < numSamples; col += 8 )
716
0
    {
717
0
      __m128i val = _mm_loadu_si128 ( ( const __m128i * ) &src[col] );
718
0
      val = _mm_min_epi16           ( vbdmax, _mm_max_epi16( vbdmin, val ) );
719
0
      _mm_storeu_si128              ( ( __m128i * )&dst[col], val );
720
0
    }
721
0
  }
722
0
  else
723
0
  {
724
0
    __m128i vbdmin  = _mm_set1_epi16( clpRng.min() );
725
0
    __m128i vbdmax  = _mm_set1_epi16( clpRng.max() );
726
727
0
    __m128i val;
728
0
    val = _vv_loadl_epi64   ( ( const __m128i * )&src[0] );
729
0
    val = _mm_min_epi16     ( vbdmax, _mm_max_epi16( vbdmin, val ) );
730
0
    _vv_storel_epi64        ( ( __m128i * )&dst[0], val );
731
0
  }
732
#if USE_AVX2
733
734
  _mm256_zeroupper();
735
#endif
736
0
}
Unexecuted instantiation: void vvenc::copyClip_SSE<(vvenc::x86_simd::X86_VEXT)1>(short const*, short*, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::copyClip_SSE<(vvenc::x86_simd::X86_VEXT)4>(short const*, short*, int, vvenc::ClpRng const&)
737
738
739
// Weighted-free bi-prediction average: dst = clip( ( src0 + src1 + offset ) >> shift ).
// W selects the vector width specialization (16 / 8 / 4 samples per iteration);
// srcAligned selects aligned vs. unaligned loads for the two source buffers.
// NOTE(review): width is assumed to be a multiple of the chosen lane count — verify at call sites.
template< X86_VEXT vext, int W, bool srcAligned >
void addAvg_SSE_algn( const int16_t* src0, int src0Stride, const int16_t* src1, int src1Stride, int16_t *dst, ptrdiff_t dstStride, int width, int height, unsigned shift, int offset, const ClpRng& clpRng )
{
#if USE_AVX2
  if( W == 16 )
  {
    const __m256i voffset   = _mm256_set1_epi32( offset );
    const __m256i vibdimin  = _mm256_set1_epi16( clpRng.min() );
    const __m256i vibdimax  = _mm256_set1_epi16( clpRng.max() );
    const __m256i vone      = _mm256_set1_epi16( 1 );

    for( int row = 0; row < height; row++ )
    {
      for( int col = 0; col < width; col += 16 )
      {
        __m256i vsrc0 = load_aligned_avx2<srcAligned>( ( const void* )&src0[col] );
        __m256i vsrc1 = load_aligned_avx2<srcAligned>( ( const void* )&src1[col] );

        // madd with vone computes per-pair 32-bit sums src0+src1 without overflow
        __m256i vsum, vdst;
        vsum = _mm256_unpacklo_epi16    ( vsrc0, vsrc1 );
        vsum = _mm256_madd_epi16        ( vsum, vone );
        vsum = _mm256_add_epi32         ( vsum, voffset );
        vdst = _mm256_srai_epi32        ( vsum, shift );
        
        vsum = _mm256_unpackhi_epi16    ( vsrc0, vsrc1 );
        vsum = _mm256_madd_epi16        ( vsum, vone );
        vsum = _mm256_add_epi32         ( vsum, voffset );
        vsum = _mm256_srai_epi32        ( vsum, shift );

        // pack lo/hi 32-bit results back to 16 bit (saturating), then clip to bit-depth range
        vdst = _mm256_packs_epi32       ( vdst, vsum );

        vdst = _mm256_min_epi16( vibdimax, _mm256_max_epi16( vibdimin, vdst ) );
        _mm256_storeu_si256( ( __m256i * )&dst[col], vdst );
      }

      src0 += src0Stride;
      src1 += src1Stride;
      dst  +=  dstStride;
    }
  }
  else
#endif
#if USE_AVX2
  if( W >= 8 )
  {
    // AVX2 build, 8-wide: widen both sources to 32 bit and add, avoiding madd
    __m256i voffset  = _mm256_set1_epi32( offset );
    __m128i vibdimin = _mm_set1_epi16   ( clpRng.min() );
    __m128i vibdimax = _mm_set1_epi16   ( clpRng.max() );

    for( int row = 0; row < height; row++ )
    {
      for( int col = 0; col < width; col += 8 )
      {
        __m256i vsrc0 = _mm256_cvtepi16_epi32( load_aligned<srcAligned>( ( const void* )&src0[col] ) );
        __m256i vsrc1 = _mm256_cvtepi16_epi32( load_aligned<srcAligned>( ( const void* )&src1[col] ) );

        __m256i
        vsum = _mm256_add_epi32        ( vsrc0, vsrc1 );
        vsum = _mm256_add_epi32        ( vsum, voffset );
        vsum = _mm256_srai_epi32       ( vsum, shift );

        // pack and move the upper-lane results next to the lower lane
        vsum = _mm256_packs_epi32      ( vsum, vsum );
        vsum = _mm256_permute4x64_epi64( vsum, 0 + ( 2 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );

        __m128i
        xsum = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, _mm256_castsi256_si128( vsum ) ) );
        _mm_storeu_si128( ( __m128i * )&dst[col], xsum );
      }

      src0 += src0Stride;
      src1 += src1Stride;
      dst  +=  dstStride;
    }
  }
#else
  if( W >= 8 )
  {
    // SSE-only build, 8-wide: same madd-based widening scheme as the 16-wide path
    const __m128i voffset  = _mm_set1_epi32( offset );
    const __m128i vibdimin = _mm_set1_epi16( clpRng.min() );
    const __m128i vibdimax = _mm_set1_epi16( clpRng.max() );
    const __m128i vone     = _mm_set1_epi16( 1 );

    for( int row = 0; row < height; row++ )
    {
      for( int col = 0; col < width; col += 8 )
      {
        __m128i vsrc0 = load_aligned<srcAligned>( ( const void* )&src0[col] );
        __m128i vsrc1 = load_aligned<srcAligned>( ( const void* )&src1[col] );

        __m128i vsum, vdst;
        vsum = _mm_unpacklo_epi16    ( vsrc0, vsrc1 );
        vsum = _mm_madd_epi16        ( vsum, vone );
        vsum = _mm_add_epi32         ( vsum, voffset );
        vdst = _mm_srai_epi32        ( vsum, shift );
        
        vsum = _mm_unpackhi_epi16    ( vsrc0, vsrc1 );
        vsum = _mm_madd_epi16        ( vsum, vone );
        vsum = _mm_add_epi32         ( vsum, voffset );
        vsum = _mm_srai_epi32        ( vsum, shift );

        vdst = _mm_packs_epi32       ( vdst, vsum );

        vdst = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vdst ) );
        _mm_storeu_si128( ( __m128i * )&dst[col], vdst );
      }

      src0 += src0Stride;
      src1 += src1Stride;
      dst  +=  dstStride;
    }
  }
#endif
  else if( W == 4 )
  {
    // 4-wide tail: widen 4 samples to 32 bit, average, pack against zero
    __m128i vzero     = _mm_setzero_si128();
    __m128i voffset   = _mm_set1_epi32( offset );
    __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
    __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );

    for( int row = 0; row < height; row++ )
    {
      for( int col = 0; col < width; col += 4 )
      {
        __m128i vsum = _vv_loadl_epi64  ( ( const __m128i * )&src0[col] );
        __m128i vdst = _vv_loadl_epi64  ( ( const __m128i * )&src1[col] );
        vsum = _mm_cvtepi16_epi32       ( vsum );
        vdst = _mm_cvtepi16_epi32       ( vdst );
        vsum = _mm_add_epi32            ( vsum, vdst );
        vsum = _mm_add_epi32            ( vsum, voffset );
        vsum = _mm_srai_epi32           ( vsum, shift );
        vsum = _mm_packs_epi32          ( vsum, vzero );

        vsum = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vsum ) );
        _vv_storel_epi64( ( __m128i * )&dst[col], vsum );
      }

      src0 += src0Stride;
      src1 += src1Stride;
      dst  +=  dstStride;
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}
Unexecuted instantiation: void vvenc::addAvg_SSE_algn<(vvenc::x86_simd::X86_VEXT)1, 4, false>(short const*, int, short const*, int, short*, long, int, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addAvg_SSE_algn<(vvenc::x86_simd::X86_VEXT)1, 8, false>(short const*, int, short const*, int, short*, long, int, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addAvg_SSE_algn<(vvenc::x86_simd::X86_VEXT)1, 16, false>(short const*, int, short const*, int, short*, long, int, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addAvg_SSE_algn<(vvenc::x86_simd::X86_VEXT)4, 4, false>(short const*, int, short const*, int, short*, long, int, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addAvg_SSE_algn<(vvenc::x86_simd::X86_VEXT)4, 8, false>(short const*, int, short const*, int, short*, long, int, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addAvg_SSE_algn<(vvenc::x86_simd::X86_VEXT)4, 16, false>(short const*, int, short const*, int, short*, long, int, int, unsigned int, int, vvenc::ClpRng const&)
889
890
template< X86_VEXT vext, int W >
891
void addAvg_SSE( const int16_t* src0, int src0Stride, const int16_t* src1, int src1Stride, int16_t *dst, int dstStride, int width, int height, unsigned shift, int offset, const ClpRng& clpRng/*, bool srcAligned*/ )
892
0
{
893
/*
894
  if( srcAligned )
895
  {
896
    addAvg_SSE_algn<vext, W, true>( src0, src0Stride, src1, src1Stride, dst, dstStride, width, height, shift, offset, clpRng );
897
  }
898
  else
899
*/
900
0
  {
901
0
    addAvg_SSE_algn<vext, W, false>( src0, src0Stride, src1, src1Stride, dst, dstStride, width, height, shift, offset, clpRng );
902
0
  }
903
0
}
Unexecuted instantiation: void vvenc::addAvg_SSE<(vvenc::x86_simd::X86_VEXT)1, 4>(short const*, int, short const*, int, short*, int, int, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addAvg_SSE<(vvenc::x86_simd::X86_VEXT)1, 8>(short const*, int, short const*, int, short*, int, int, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addAvg_SSE<(vvenc::x86_simd::X86_VEXT)1, 16>(short const*, int, short const*, int, short*, int, int, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addAvg_SSE<(vvenc::x86_simd::X86_VEXT)4, 4>(short const*, int, short const*, int, short*, int, int, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addAvg_SSE<(vvenc::x86_simd::X86_VEXT)4, 8>(short const*, int, short const*, int, short*, int, int, int, unsigned int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addAvg_SSE<(vvenc::x86_simd::X86_VEXT)4, 16>(short const*, int, short const*, int, short*, int, int, int, unsigned int, int, vvenc::ClpRng const&)
904
905
906
// Weighted bi-prediction average: dst = clip( ( w0*src0 + w1*src1 + offset ) >> shift ).
// The interleaved weight vector vw = [w0,w1,w0,w1,...] lets a single madd_epi16
// compute w0*a + w1*b as one 32-bit result per sample pair.
// W selects the 8-wide (with AVX2 16-wide fast path) or 4-wide specialization.
template< X86_VEXT vext, int W >
void addWghtAvg_SSE( const int16_t* src0, int src0Stride, const int16_t* src1, int src1Stride, int16_t *dst, int dstStride, int width, int height, unsigned shift, int offset, int w0, int w1, const ClpRng& clpRng )
{
  if( W == 8 )
  {
#if USE_AVX2
    if( ( width & 15 ) == 0 && vext >= AVX2 )
    {
      // AVX2 path: 16 samples per iteration (only when width is a multiple of 16)
      __m256i voffset  = _mm256_set1_epi32( offset );
      __m256i vibdimin = _mm256_set1_epi16( clpRng.min() );
      __m256i vibdimax = _mm256_set1_epi16( clpRng.max() );
      __m256i vw       = _mm256_unpacklo_epi16( _mm256_set1_epi16( w0 ), _mm256_set1_epi16( w1 ) );

      for( int row = 0; row < height; row++ )
      {
        for( int col = 0; col < width; col += 16 )
        {
          __m256i vsrc0 = _mm256_loadu_si256( ( const __m256i * )&src0[col] );
          __m256i vsrc1 = _mm256_loadu_si256( ( const __m256i * )&src1[col] );

          // low halves: weighted sum, round, shift
          __m256i vtmp, vsum;
          vsum = _mm256_madd_epi16       ( vw, _mm256_unpacklo_epi16( vsrc0, vsrc1 ) );
          vsum = _mm256_add_epi32        ( vsum, voffset );
          vtmp = _mm256_srai_epi32       ( vsum, shift );
        
          // high halves, then pack back to 16 bit and clip to bit-depth range
          vsum = _mm256_madd_epi16       ( vw, _mm256_unpackhi_epi16( vsrc0, vsrc1 ) );
          vsum = _mm256_add_epi32        ( vsum, voffset );
          vsum = _mm256_srai_epi32       ( vsum, shift );
          vsum = _mm256_packs_epi32      ( vtmp, vsum );

          vsum = _mm256_min_epi16( vibdimax, _mm256_max_epi16( vibdimin, vsum ) );
          _mm256_storeu_si256( ( __m256i * )&dst[col], vsum );
        }

        src0 += src0Stride;
        src1 += src1Stride;
        dst  +=  dstStride;
      }
    }
    else
#endif
    {
      // SSE path: 8 samples per iteration, same madd/pack/clip scheme
      __m128i voffset  = _mm_set1_epi32( offset );
      __m128i vibdimin = _mm_set1_epi16( clpRng.min() );
      __m128i vibdimax = _mm_set1_epi16( clpRng.max() );
      __m128i vw       = _mm_unpacklo_epi16( _mm_set1_epi16( w0 ), _mm_set1_epi16( w1 ) );

      for( int row = 0; row < height; row++ )
      {
        for( int col = 0; col < width; col += 8 )
        {
          __m128i vsrc0 = _mm_loadu_si128( ( const __m128i * )&src0[col] );
          __m128i vsrc1 = _mm_loadu_si128( ( const __m128i * )&src1[col] );

          __m128i vtmp, vsum;
          vsum = _mm_madd_epi16       ( vw, _mm_unpacklo_epi16( vsrc0, vsrc1 ) );
          vsum = _mm_add_epi32        ( vsum, voffset );
          vtmp = _mm_srai_epi32       ( vsum, shift );
        
          vsum = _mm_madd_epi16       ( vw, _mm_unpackhi_epi16( vsrc0, vsrc1 ) );
          vsum = _mm_add_epi32        ( vsum, voffset );
          vsum = _mm_srai_epi32       ( vsum, shift );
          vsum = _mm_packs_epi32      ( vtmp, vsum );

          vsum = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vsum ) );
          _mm_storeu_si128( ( __m128i * )&dst[col], vsum );
        }

        src0 += src0Stride;
        src1 += src1Stride;
        dst  +=  dstStride;
      }
    }
  }
  else if( W == 4 )
  {
    // 4-wide path: one 64-bit load per source, pack against zero after shifting
    __m128i vzero     = _mm_setzero_si128();
    __m128i voffset   = _mm_set1_epi32( offset );
    __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
    __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
    __m128i vw        = _mm_unpacklo_epi16( _mm_set1_epi16( w0 ), _mm_set1_epi16( w1 ) );

    for( int row = 0; row < height; row++ )
    {
      for( int col = 0; col < width; col += 4 )
      {
        __m128i vsum = _vv_loadl_epi64  ( ( const __m128i * )&src0[col] );
        __m128i vdst = _vv_loadl_epi64  ( ( const __m128i * )&src1[col] );
        vsum = _mm_madd_epi16           ( vw, _mm_unpacklo_epi16( vsum, vdst ) );
        vsum = _mm_add_epi32            ( vsum, voffset );
        vsum = _mm_srai_epi32           ( vsum, shift );
        vsum = _mm_packs_epi32          ( vsum, vzero );

        vsum = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vsum ) );
        _vv_storel_epi64( ( __m128i * )&dst[col], vsum );
      }

      src0 += src0Stride;
      src1 += src1Stride;
      dst  +=  dstStride;
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}
Unexecuted instantiation: void vvenc::addWghtAvg_SSE<(vvenc::x86_simd::X86_VEXT)1, 4>(short const*, int, short const*, int, short*, int, int, int, unsigned int, int, int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addWghtAvg_SSE<(vvenc::x86_simd::X86_VEXT)1, 8>(short const*, int, short const*, int, short*, int, int, int, unsigned int, int, int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addWghtAvg_SSE<(vvenc::x86_simd::X86_VEXT)4, 4>(short const*, int, short const*, int, short*, int, int, int, unsigned int, int, int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::addWghtAvg_SSE<(vvenc::x86_simd::X86_VEXT)4, 8>(short const*, int, short const*, int, short*, int, int, int, unsigned int, int, int, int, vvenc::ClpRng const&)
1017
1018
template< X86_VEXT vext >
1019
void roundIntVector_SIMD(int* v, int size, unsigned int nShift, const int dmvLimit)
1020
0
{
1021
0
  CHECKD(size % 16 != 0, "Size must be multiple of 16!");
1022
#ifdef USE_AVX512
1023
  if (vext >= AVX512 && size >= 16)
1024
  {
1025
    __m512i dMvMin = _mm256_set1_epi32(-dmvLimit);
1026
    __m512i dMvMax = _mm256_set1_epi32(dmvLimit);
1027
    __m512i nOffset = _mm512_set1_epi32((1 << (nShift - 1)));
1028
    __m512i vones = _mm512_set1_epi32(1);
1029
    __m512i vzero = _mm512_setzero_si512();
1030
    for (int i = 0; i < size; i += 16, v += 16)
1031
    {
1032
      __m512i src = _mm512_loadu_si512(v);
1033
      __mmask16 mask = _mm512_cmpge_epi32_mask(src, vzero);
1034
      src = __mm512_add_epi32(src, nOffset);
1035
      __mm512i dst = _mm512_srai_epi32(_mm512_mask_sub_epi32(src, mask, src, vones), nShift);
1036
      dst = _mm512_min_epi32(dMvMax, _mm512_max_epi32(dMvMin, dst));
1037
      _mm512_storeu_si512(v, dst);
1038
    }
1039
  }
1040
  else
1041
#endif
1042
#ifdef USE_AVX2
1043
0
  if (vext >= AVX2 && size >= 8)
1044
0
  {
1045
0
    __m256i dMvMin = _mm256_set1_epi32(-dmvLimit);
1046
0
    __m256i dMvMax = _mm256_set1_epi32(dmvLimit);
1047
0
    __m256i nOffset = _mm256_set1_epi32(1 << (nShift - 1));
1048
0
    __m256i vzero = _mm256_setzero_si256();
1049
0
    for (int i = 0; i < size; i += 8, v += 8)
1050
0
    {
1051
0
      __m256i src = _mm256_lddqu_si256((__m256i*)v);
1052
0
      __m256i of  = _mm256_cmpgt_epi32(src, vzero);
1053
0
      __m256i dst = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(src, nOffset), of), nShift);
1054
0
      dst = _mm256_min_epi32(dMvMax, _mm256_max_epi32(dMvMin, dst));
1055
0
      _mm256_storeu_si256((__m256i*)v, dst);
1056
0
    }
1057
0
  }
1058
0
  else
1059
0
#endif
1060
0
  {
1061
0
    __m128i dMvMin = _mm_set1_epi32(-dmvLimit);
1062
0
    __m128i dMvMax = _mm_set1_epi32(dmvLimit);
1063
0
    __m128i nOffset = _mm_set1_epi32((1 << (nShift - 1)));
1064
0
    __m128i vzero = _mm_setzero_si128();
1065
0
    for (int i = 0; i < size; i += 4, v += 4)
1066
0
    {
1067
0
      __m128i src = _mm_loadu_si128((__m128i*)v);
1068
0
      __m128i of  = _mm_cmpgt_epi32(src, vzero);
1069
0
      __m128i dst = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(src, nOffset), of), nShift);
1070
0
      dst = _mm_min_epi32(dMvMax, _mm_max_epi32(dMvMin, dst));
1071
0
      _mm_storeu_si128((__m128i*)v, dst);
1072
0
    }
1073
0
  }
1074
0
}
Unexecuted instantiation: void vvenc::roundIntVector_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int*, int, unsigned int, int)
Unexecuted instantiation: void vvenc::roundIntVector_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int*, int, unsigned int, int)
1075
1076
1077
template< X86_VEXT vext, int W >
1078
void reco_SSE( const int16_t* src0, int src0Stride, const int16_t* src1, int src1Stride, int16_t *dst, int dstStride, int width, int height, const ClpRng& clpRng )
1079
0
{
1080
0
  if( W == 8 )
1081
0
  {
1082
#if USE_AVX2
1083
0
    if( vext >= AVX2 && ( width & 15 ) == 0 )
1084
0
    {
1085
0
      __m256i vbdmin = _mm256_set1_epi16( clpRng.min() );
1086
0
      __m256i vbdmax = _mm256_set1_epi16( clpRng.max() );
1087
1088
0
      for( int row = 0; row < height; row++ )
1089
0
      {
1090
0
        for( int col = 0; col < width; col += 16 )
1091
0
        {
1092
0
          __m256i vdest = _mm256_loadu_si256 ( ( const __m256i * )&src0[col] );
1093
0
          __m256i vsrc1 = _mm256_loadu_si256( ( const __m256i * )&src1[col] );
1094
1095
0
          vdest = _mm256_adds_epi16( vdest, vsrc1 );
1096
0
          vdest = _mm256_min_epi16 ( vbdmax, _mm256_max_epi16( vbdmin, vdest ) );
1097
1098
0
          _mm256_storeu_si256( ( __m256i * )&dst[col], vdest );
1099
0
        }
1100
1101
0
        src0 += src0Stride;
1102
0
        src1 += src1Stride;
1103
0
        dst  += dstStride;
1104
0
      }
1105
0
    }
1106
0
    else
1107
0
#endif
1108
0
    {
1109
0
      __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
1110
0
      __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
1111
1112
0
      for( int row = 0; row < height; row++ )
1113
0
      {
1114
0
        for( int col = 0; col < width; col += 8 )
1115
0
        {
1116
0
          __m128i vdest = _mm_loadu_si128 ( ( const __m128i * )&src0[col] );
1117
0
          __m128i vsrc1 = _mm_loadu_si128( ( const __m128i * )&src1[col] );
1118
1119
0
          vdest = _mm_adds_epi16( vdest, vsrc1 );
1120
0
          vdest = _mm_min_epi16 ( vbdmax, _mm_max_epi16( vbdmin, vdest ) );
1121
1122
0
          _mm_storeu_si128( ( __m128i * )&dst[col], vdest );
1123
0
        }
1124
1125
0
        src0 += src0Stride;
1126
0
        src1 += src1Stride;
1127
0
        dst  += dstStride;
1128
0
      }
1129
0
    }
1130
0
  }
1131
0
  else if( W == 4 )
1132
0
  {
1133
0
    __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
1134
0
    __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
1135
1136
0
    for( int row = 0; row < height; row++ )
1137
0
    {
1138
0
      for( int col = 0; col < width; col += 4 )
1139
0
      {
1140
0
        __m128i vsrc = _vv_loadl_epi64( ( const __m128i * )&src0[col] );
1141
0
        __m128i vdst = _vv_loadl_epi64( ( const __m128i * )&src1[col] );
1142
1143
0
        vdst = _mm_adds_epi16( vdst, vsrc );
1144
0
        vdst = _mm_min_epi16 ( vbdmax, _mm_max_epi16( vbdmin, vdst ) );
1145
1146
0
        _vv_storel_epi64( ( __m128i * )&dst[col], vdst );
1147
0
      }
1148
1149
0
      src0 += src0Stride;
1150
0
      src1 += src1Stride;
1151
0
      dst  +=  dstStride;
1152
0
    }
1153
0
  }
1154
0
  else
1155
0
  {
1156
0
    THROW( "Unsupported size" );
1157
0
  }
1158
#if USE_AVX2
1159
1160
0
  _mm256_zeroupper();
1161
0
#endif
1162
0
}
Unexecuted instantiation: void vvenc::reco_SSE<(vvenc::x86_simd::X86_VEXT)1, 4>(short const*, int, short const*, int, short*, int, int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::reco_SSE<(vvenc::x86_simd::X86_VEXT)1, 8>(short const*, int, short const*, int, short*, int, int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::reco_SSE<(vvenc::x86_simd::X86_VEXT)4, 4>(short const*, int, short const*, int, short*, int, int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::reco_SSE<(vvenc::x86_simd::X86_VEXT)4, 8>(short const*, int, short const*, int, short*, int, int, int, vvenc::ClpRng const&)
1163
1164
// Row-wise byte copy between strided buffers, with prefetching of the next
// source row. Collapses to one bulk memcpy when both buffers are contiguous
// (stride == width).
template<X86_VEXT vext>
void copyBufferSimd( const char* src, int srcStride, char* dst, int dstStride, int width, int height)
{
  _mm_prefetch( src            , _MM_HINT_T0 );
  _mm_prefetch( src + srcStride, _MM_HINT_T0 );

  if( srcStride == width && dstStride == width )
  {
    // contiguous layout on both sides: a single copy covers everything
    memcpy( dst, src, width * height );
    return;
  }

  for( ; height > 0; height-- )
  {
    // warm the cache with the upcoming source row before copying the current one
    _mm_prefetch( src + srcStride, _MM_HINT_T0 );

    memcpy( dst, src, width );

    src += srcStride;
    dst += dstStride;
  }
}
Unexecuted instantiation: void vvenc::copyBufferSimd<(vvenc::x86_simd::X86_VEXT)1>(char const*, int, char*, int, int, int)
Unexecuted instantiation: void vvenc::copyBufferSimd<(vvenc::x86_simd::X86_VEXT)4>(char const*, int, char*, int, int, int)
1190
1191
#if ENABLE_SIMD_OPT_BCW
1192
// In-place high-frequency removal for BCW: src0[i] = 2*src0[i] - src1[i]
// (16-bit arithmetic, wrapping on overflow). W selects the 8-wide or the
// paired-row 4-wide specialization.
// NOTE(review): the W == 4 path processes two rows per iteration — height is
// presumably even here; verify at call sites.
template< X86_VEXT vext, int W >
void removeHighFreq_SSE(int16_t* src0, int src0Stride, const int16_t* src1, int src1Stride, int width, int height)
{
 if (W == 8)
 {
   // TODO: AVX2 impl
   {
     for (int row = 0; row < height; row++)
     {
       for (int col = 0; col < width; col += 8)
       {
         __m128i vsrc0 = _mm_load_si128((const __m128i *)&src0[col]);
         __m128i vsrc1 = _mm_load_si128((const __m128i *)&src1[col]);

         // 2*src0 - src1 via shift-left then subtract
         vsrc0 = _mm_sub_epi16(_mm_slli_epi16(vsrc0, 1), vsrc1);
         _mm_store_si128((__m128i *)&src0[col], vsrc0);
       }

       src0 += src0Stride;
       src1 += src1Stride;
     }
   }
 }
 else if (W == 4)
 {
   // Two rows per iteration: pack row pairs into one 128-bit register
   for (int row = 0; row < height; row += 2)
   {
     __m128i vsrc0 = _vv_loadl_epi64((const __m128i *)src0);
     __m128i vsrc1 = _vv_loadl_epi64((const __m128i *)src1);
     __m128i vsrc0_2 = _vv_loadl_epi64((const __m128i *)(src0 + src0Stride));
     __m128i vsrc1_2 = _vv_loadl_epi64((const __m128i *)(src1 + src1Stride));

     // lane layout after unpack: [row0 | row1]
     vsrc0 = _mm_unpacklo_epi64(vsrc0, vsrc0_2);
     vsrc1 = _mm_unpacklo_epi64(vsrc1, vsrc1_2);

     vsrc0 = _mm_sub_epi16(_mm_slli_epi16(vsrc0, 1), vsrc1);
     // write low half back to row0, high half to row1
     _vv_storel_epi64((__m128i *)src0, vsrc0);
     _vv_storel_epi64((__m128i *)(src0 + src0Stride), _mm_unpackhi_epi64(vsrc0, vsrc0));

     src0 += (src0Stride << 1);
     src1 += (src1Stride << 1);
   }
 }
 else
 {
   THROW("Unsupported size");
 }
}
Unexecuted instantiation: void vvenc::removeHighFreq_SSE<(vvenc::x86_simd::X86_VEXT)1, 8>(short*, int, short const*, int, int, int)
Unexecuted instantiation: void vvenc::removeHighFreq_SSE<(vvenc::x86_simd::X86_VEXT)1, 4>(short*, int, short const*, int, int, int)
Unexecuted instantiation: void vvenc::removeHighFreq_SSE<(vvenc::x86_simd::X86_VEXT)4, 8>(short*, int, short const*, int, int, int)
Unexecuted instantiation: void vvenc::removeHighFreq_SSE<(vvenc::x86_simd::X86_VEXT)4, 4>(short*, int, short const*, int, int, int)
1240
#endif
1241
1242
template<X86_VEXT vext, int W>
1243
void sub_SSE( const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel* dest, int destStride, int width, int height )
1244
0
{
1245
0
  if( W == 8 )
1246
0
  {
1247
0
    while( height-- )
1248
0
    {
1249
0
      for( int x = 0; x < width; x += 8 )
1250
0
      {
1251
0
        __m128i vsrc0 = _mm_load_si128( ( const __m128i* ) &src0[x] );
1252
0
        __m128i vsrc1 = _mm_load_si128( ( const __m128i* ) &src1[x] );
1253
0
        __m128i vdest = _mm_sub_epi16 ( vsrc0, vsrc1 );
1254
1255
0
        _mm_storeu_si128( ( __m128i* ) &dest[x], vdest );
1256
0
      }
1257
1258
0
      src0 += src0Stride;
1259
0
      src1 += src1Stride;
1260
0
      dest += destStride;
1261
0
    }
1262
0
  }
1263
0
  else if( W == 4 )
1264
0
  {
1265
0
    while( height-- )
1266
0
    {
1267
0
      for( int x = 0; x < width; x += 8 )
1268
0
      {
1269
0
        __m128i vsrc0 = _vv_loadl_epi64( ( const __m128i* ) &src0[x] );
1270
0
        __m128i vsrc1 = _vv_loadl_epi64( ( const __m128i* ) &src1[x] );
1271
0
        __m128i vdest = _mm_sub_epi16  ( vsrc0, vsrc1 );
1272
1273
0
        _vv_storel_epi64( ( __m128i* ) &dest[x], vdest );
1274
0
      }
1275
1276
0
      src0 += src0Stride;
1277
0
      src1 += src1Stride;
1278
0
      dest += destStride;
1279
0
    }
1280
0
  }
1281
0
  else
1282
0
  {
1283
0
    THROW("Unsupported size");
1284
0
  }
1285
0
}
Unexecuted instantiation: void vvenc::sub_SSE<(vvenc::x86_simd::X86_VEXT)1, 4>(short const*, int, short const*, int, short*, int, int, int)
Unexecuted instantiation: void vvenc::sub_SSE<(vvenc::x86_simd::X86_VEXT)1, 8>(short const*, int, short const*, int, short*, int, int, int)
Unexecuted instantiation: void vvenc::sub_SSE<(vvenc::x86_simd::X86_VEXT)4, 4>(short const*, int, short const*, int, short*, int, int, int)
Unexecuted instantiation: void vvenc::sub_SSE<(vvenc::x86_simd::X86_VEXT)4, 8>(short const*, int, short const*, int, short*, int, int, int)
1286
1287
// Compile-time-selected arithmetic shift of packed 32-bit lanes.
// doShift == false specializations are intentional no-ops (shift elided at
// compile time); shiftR selects right (sra) vs. left (sll) shift.
template<bool doShift, bool shiftR, typename T> static inline void do_shift( T &vreg, int num );
#if USE_AVX2
template<> inline void do_shift<true,  true , __m256i>( __m256i &vreg, int num ) { vreg = _mm256_sra_epi32( vreg, _mm_cvtsi32_si128( num ) ); }
template<> inline void do_shift<true,  false, __m256i>( __m256i &vreg, int num ) { vreg = _mm256_sll_epi32( vreg, _mm_cvtsi32_si128( num ) ); }
template<> inline void do_shift<false, true , __m256i>( __m256i &vreg, int num ) { }
template<> inline void do_shift<false, false, __m256i>( __m256i &vreg, int num ) { }
#endif
template<> inline void do_shift<true,  true , __m128i>( __m128i &vreg, int num ) { vreg = _mm_sra_epi32( vreg, _mm_cvtsi32_si128( num ) ); }
template<> inline void do_shift<true,  false, __m128i>( __m128i &vreg, int num ) { vreg = _mm_sll_epi32( vreg, _mm_cvtsi32_si128( num ) ); }
template<> inline void do_shift<false, true , __m128i>( __m128i &vreg, int num ) { }
template<> inline void do_shift<false, false, __m128i>( __m128i &vreg, int num ) { }
1298
1299
// Compile-time-selected lane-wise 32-bit multiply; the mult == false
// specializations are intentional no-ops.
template<bool mult, typename T> static inline void do_mult( T& vreg, T& vmult );
template<> inline void do_mult<false, __m128i>( __m128i&, __m128i& ) { }
#if USE_AVX2
template<> inline void do_mult<false, __m256i>( __m256i&, __m256i& ) { }
#endif
template<> inline void do_mult<true,   __m128i>( __m128i& vreg, __m128i& vmult ) { vreg = _mm_mullo_epi32   ( vreg, vmult ); }
#if USE_AVX2
template<> inline void do_mult<true,   __m256i>( __m256i& vreg, __m256i& vmult ) { vreg = _mm256_mullo_epi32( vreg, vmult ); }
#endif
1308
1309
// Compile-time-selected lane-wise 32-bit add; the add == false specializations
// are intentional no-ops.
template<bool add, typename T> static inline void do_add( T& vreg, T& vadd );
template<> inline void do_add<false, __m128i>( __m128i&, __m128i& ) { }
#if USE_AVX2
template<> inline void do_add<false, __m256i>( __m256i&, __m256i& ) { }
#endif
template<> inline void do_add<true,  __m128i>( __m128i& vreg, __m128i& vadd ) { vreg = _mm_add_epi32( vreg, vadd ); }
#if USE_AVX2
template<> inline void do_add<true,  __m256i>( __m256i& vreg, __m256i& vadd ) { vreg = _mm256_add_epi32( vreg, vadd ); }
#endif
1318
1319
// Compile-time-gated clipping of signed 16-bit lanes into [vbdmin, vbdmax].
// When 'clip' is false the call compiles away to nothing.
template<bool clip, typename T> static inline void do_clip( T& val, T& vbdmin, T& vbdmax );

// disabled: leave the value untouched
template<> inline void do_clip<false, __m128i>( __m128i&, __m128i&, __m128i& )
{
}

#if USE_AVX2
template<> inline void do_clip<false, __m256i>( __m256i&, __m256i&, __m256i& )
{
}
#endif

// enabled: clamp each signed 16-bit lane via max-then-min
template<> inline void do_clip<true, __m128i>( __m128i& val, __m128i& vbdmin, __m128i& vbdmax )
{
  val = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, val ) );
}

#if USE_AVX2
template<> inline void do_clip<true, __m256i>( __m256i& val, __m256i& vbdmin, __m256i& vbdmax )
{
  val = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, val ) );
}
#endif
// Linear transform of a block of Pel (16-bit) samples:
//   dst = clip( ( src * scale >> shift ) + offset )
// where each stage is enabled/disabled at compile time by the template flags:
//   mult    -> multiply by 'scale'
//   doShift -> shift by 'shift' (shiftR selects right shift; the left-shift
//              variant is currently disabled in the dispatcher below)
//   doAdd   -> add 'offset'
//   clip    -> clamp the result to [clpRng.min(), clpRng.max()]
// Disabled stages compile away entirely (see the do_* no-op specializations
// above), so the inner loop contains only the work actually requested.
// The AVX2 path processes 8 samples per iteration, the SSE path 4.
// NOTE(review): the SSE path assumes width is a multiple of 4 (and the AVX2
// path a multiple of 8) — confirm callers guarantee this.
template<X86_VEXT vext, int W, bool doAdd, bool mult, bool doShift, bool shiftR, bool clip>
void linTf_SSE( const Pel* src, int srcStride, Pel* dst, int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng )
{
  if( vext >= AVX2 && ( width & 7 ) == 0 && W == 8 )
  {
#if USE_AVX2
    // constants broadcast once per call
    __m256i vzero    = _mm256_setzero_si256();
    __m256i vbdmin   = _mm256_set1_epi16( clpRng.min() );
    __m256i vbdmax   = _mm256_set1_epi16( clpRng.max() );
    __m256i voffset  = _mm256_set1_epi32( offset );
    __m256i vscale   = _mm256_set1_epi32( scale );

    for( int row = 0; row < height; row++ )
    {
      for( int col = 0; col < width; col += 8 )
      {
        __m256i val;
        // widen 8x16-bit samples to 32 bits so the arithmetic cannot overflow
        val = _mm256_cvtepi16_epi32       (  _mm_loadu_si128( ( const __m128i * )&src[col] ) );
        do_mult<mult, __m256i>            ( val, vscale );
        do_shift<doShift, shiftR, __m256i>( val, shift );
        do_add<doAdd, __m256i>            ( val, voffset );
        // saturating narrow back to 16 bits; valid lanes end up in the low
        // half of each 128-bit lane
        val = _mm256_packs_epi32          ( val, vzero );
        do_clip<clip, __m256i>            ( val, vbdmin, vbdmax );
        // gather qwords 0 and 2 into the low 128 bits so one 128-bit store
        // writes all 8 results
        val = _mm256_permute4x64_epi64    ( val, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 1 << 6 ) );

        _mm_storeu_si128                  ( ( __m128i * )&dst[col], _mm256_castsi256_si128( val ) );
      }

      src += srcStride;
      dst += dstStride;
    }
#endif
  }
  else
  {
    // SSE path: 4 samples per iteration, same stage order as above
    __m128i vzero   = _mm_setzero_si128();
    __m128i vbdmin  = _mm_set1_epi16   ( clpRng.min() );
    __m128i vbdmax  = _mm_set1_epi16   ( clpRng.max() );
    __m128i voffset = _mm_set1_epi32   ( offset );
    __m128i vscale  = _mm_set1_epi32   ( scale );

    for( int row = 0; row < height; row++ )
    {
      for( int col = 0; col < width; col += 4 )
      {
        __m128i val;
        val = _vv_loadl_epi64             ( ( const __m128i * )&src[col] );
        val = _mm_cvtepi16_epi32          ( val );
        do_mult<mult, __m128i>            ( val, vscale );
        do_shift<doShift, shiftR, __m128i>( val, shift );
        do_add<doAdd, __m128i>            ( val, voffset );
        val = _mm_packs_epi32             ( val, vzero );
        do_clip<clip, __m128i>            ( val, vbdmin, vbdmax );

        _vv_storel_epi64                  ( ( __m128i * )&dst[col], val );
      }

      src += srcStride;
      dst += dstStride;
    }
  }
#if USE_AVX2

  // leave AVX state clean to avoid SSE/AVX transition penalties
  _mm256_zeroupper();
#endif
}
// Dispatch wrapper for linTf_SSE: maps the runtime parametrization to the
// matching compile-time specialization so that unused stages are compiled out
// of the inner loop. Bit encoding of 'fn':
//   16 -> offset == 0  => no offset add   (doAdd   = false)
//    8 -> scale  == 1  => no scaling      (mult    = false)
//    4 -> shift  == 0  => no shift        (doShift = false)
//    2 -> (left-shift variant, currently disabled along with its cases)
//    1 -> !clip        => no clamping     (clip    = false)
// Any combination not covered by a case label (only possible if the disabled
// bit 2 were re-enabled) throws.
template<X86_VEXT vext, int W>
void linTf_SSE_entry( const Pel* src, int srcStride, Pel* dst, int dstStride, int width, int height, int scale, unsigned shift, int offset, const ClpRng& clpRng, bool clip )
{
  int fn = ( offset == 0 ? 16 : 0 ) + ( scale == 1 ? 8 : 0 ) + ( shift == 0 ? 4 : 0 ) /*+ ( shift < 0 ? 2 : 0 )*/ + ( !clip ? 1 : 0 );

  switch( fn )
  {
  case  0: linTf_SSE<vext, W, true,  true,  true,  true,  true >( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
  case  1: linTf_SSE<vext, W, true,  true,  true,  true,  false>( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
//  case  2: linTf_SSE<vext, W, true,  true,  true,  false, true >( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
//  case  3: linTf_SSE<vext, W, true,  true,  true,  false, false>( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
  case  4: linTf_SSE<vext, W, true,  true,  false, true,  true >( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
  case  5: linTf_SSE<vext, W, true,  true,  false, true,  false>( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
//  case  6: linTf_SSE<vext, W, true,  true,  false, false, true >( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
//  case  7: linTf_SSE<vext, W, true,  true,  false, false, false>( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
  case  8: linTf_SSE<vext, W, true,  false, true,  true,  true >( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
  case  9: linTf_SSE<vext, W, true,  false, true,  true,  false>( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
//  case 10: linTf_SSE<vext, W, true,  false, true,  false, true >( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
//  case 11: linTf_SSE<vext, W, true,  false, true,  false, false>( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
  case 12: linTf_SSE<vext, W, true,  false, false, true,  true >( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
  case 13: linTf_SSE<vext, W, true,  false, false, true,  false>( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
//  case 14: linTf_SSE<vext, W, true,  false, false, false, true >( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
//  case 15: linTf_SSE<vext, W, true,  false, false, false, false>( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
  case 16: linTf_SSE<vext, W, false, true,  true,  true,  true >( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
  case 17: linTf_SSE<vext, W, false, true,  true,  true,  false>( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
//  case 18: linTf_SSE<vext, W, false, true,  true,  false, true >( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
//  case 19: linTf_SSE<vext, W, false, true,  true,  false, false>( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
  case 20: linTf_SSE<vext, W, false, true,  false, true,  true >( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
  case 21: linTf_SSE<vext, W, false, true,  false, true,  false>( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
//  case 22: linTf_SSE<vext, W, false, true,  false, false, true >( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
//  case 23: linTf_SSE<vext, W, false, true,  false, false, false>( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
  case 24: linTf_SSE<vext, W, false, false, true,  true,  true >( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
  case 25: linTf_SSE<vext, W, false, false, true,  true,  false>( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
//  case 26: linTf_SSE<vext, W, false, false, true,  false, true >( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
//  case 27: linTf_SSE<vext, W, false, false, true,  false, false>( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
  case 28: linTf_SSE<vext, W, false, false, false, true,  true >( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
  case 29: linTf_SSE<vext, W, false, false, false, true,  false>( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng ); break;
//  case 30: linTf_SSE<vext, W, false, false, false, false, true >( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
//  case 31: linTf_SSE<vext, W, false, false, false, false, false>( src, srcStride, dst, dstStride, width, height, scale, -shift, offset, clpRng ); break;
  default:
    THROW( "Unknown parametrization of the linear transformation" );
    break;
  }
}
template<X86_VEXT vext, int W>
1443
void copyClip_SSE( const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height, const ClpRng& clpRng )
1444
0
{
1445
0
  if( vext >= AVX2 && ( width & 15 ) == 0 && W == 8 )
1446
0
  {
1447
#if USE_AVX2
1448
    __m256i vbdmin   = _mm256_set1_epi16( clpRng.min() );
1449
    __m256i vbdmax   = _mm256_set1_epi16( clpRng.max() );
1450
1451
0
    for( int row = 0; row < height; row++ )
1452
0
    {
1453
0
      for( int col = 0; col < width; col += 16 )
1454
0
      {
1455
0
        __m256i val = _mm256_loadu_si256  ( ( const __m256i * ) &src[col] );
1456
0
        val = _mm256_min_epi16            ( vbdmax, _mm256_max_epi16( vbdmin, val ) );
1457
0
        _mm256_storeu_si256               ( ( __m256i * )&dst[col], val );
1458
0
      }
1459
1460
0
      src += srcStride;
1461
0
      dst += dstStride;
1462
0
    }
1463
#endif
1464
0
  }
1465
0
  else if( W == 8 )
1466
0
  {
1467
0
    __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
1468
0
    __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
1469
1470
0
    for( int row = 0; row < height; row++ )
1471
0
    {
1472
0
      for( int col = 0; col < width; col += 8 )
1473
0
      {
1474
0
        __m128i val = _mm_loadu_si128 ( ( const __m128i * ) &src[col] );
1475
0
        val = _mm_min_epi16           ( vbdmax, _mm_max_epi16( vbdmin, val ) );
1476
0
        _mm_storeu_si128              ( ( __m128i * )&dst[col], val );
1477
0
      }
1478
1479
0
      src += srcStride;
1480
0
      dst += dstStride;
1481
0
    }
1482
0
  }
1483
0
  else
1484
0
  {
1485
0
    __m128i vbdmin  = _mm_set1_epi16( clpRng.min() );
1486
0
    __m128i vbdmax  = _mm_set1_epi16( clpRng.max() );
1487
1488
0
    for( int row = 0; row < height; row++ )
1489
0
    {
1490
0
      for( int col = 0; col < width; col += 4 )
1491
0
      {
1492
0
        __m128i val;
1493
0
        val = _vv_loadl_epi64   ( ( const __m128i * )&src[col] );
1494
0
        val = _mm_min_epi16     ( vbdmax, _mm_max_epi16( vbdmin, val ) );
1495
0
        _vv_storel_epi64        ( ( __m128i * )&dst[col], val );
1496
0
      }
1497
1498
0
      src += srcStride;
1499
0
      dst += dstStride;
1500
0
    }
1501
0
  }
1502
#if USE_AVX2
1503
1504
  _mm256_zeroupper();
1505
#endif
1506
0
}
Unexecuted instantiation: void vvenc::copyClip_SSE<(vvenc::x86_simd::X86_VEXT)1, 4>(short const*, int, short*, int, int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::copyClip_SSE<(vvenc::x86_simd::X86_VEXT)1, 8>(short const*, int, short*, int, int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::copyClip_SSE<(vvenc::x86_simd::X86_VEXT)4, 4>(short const*, int, short*, int, int, int, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::copyClip_SSE<(vvenc::x86_simd::X86_VEXT)4, 8>(short const*, int, short*, int, int, int, vvenc::ClpRng const&)
1507
1508
// Transpose an NxN block of 16-bit samples (N == W, 4 or 8) from src to dst
// using the classic in-register unpack-based transpose: interleave at 16-bit,
// then 32-bit, then 64-bit granularity until rows have become columns.
template<X86_VEXT vext, int W>
void transposeNxN_SSE( const Pel* src, int srcStride, Pel* dst, int dstStride )
{
  if( W == 4 )
  {
    __m128i va, vb, vc, vd;

    // load the four rows; 4 samples each, in the low 64 bits
    va = _vv_loadl_epi64( ( const __m128i* ) src ); src += srcStride;
    vb = _vv_loadl_epi64( ( const __m128i* ) src ); src += srcStride;
    vc = _vv_loadl_epi64( ( const __m128i* ) src ); src += srcStride;
    vd = _vv_loadl_epi64( ( const __m128i* ) src );

    // pairwise 16-bit interleave: va01b01 = a0 b0 a1 b1 a2 b2 a3 b3;
    // the unpackhi_epi64 then moves the a2 b2 a3 b3 half into the low qword
    __m128i va01b01 = _mm_unpacklo_epi16( va,      vb );
    __m128i va23b23 = _mm_unpackhi_epi64( va01b01, vb );
    __m128i vc01d01 = _mm_unpacklo_epi16( vc,      vd );
    __m128i vc23d23 = _mm_unpackhi_epi64( vc01d01, vd );

    // 32-bit interleave produces two transposed columns per register;
    // unpackhi_epi64 splits off the second column of each pair
    va = _mm_unpacklo_epi32( va01b01, vc01d01 );
    vb = _mm_unpackhi_epi64( va,      va );
    vc = _mm_unpacklo_epi32( va23b23, vc23d23 );
    vd = _mm_unpackhi_epi64( vc,      vc );

    // store the four transposed rows
    _vv_storel_epi64( ( __m128i* ) dst, va ); dst += dstStride;
    _vv_storel_epi64( ( __m128i* ) dst, vb ); dst += dstStride;
    _vv_storel_epi64( ( __m128i* ) dst, vc ); dst += dstStride;
    _vv_storel_epi64( ( __m128i* ) dst, vd );
  }
  else if( W == 8 )
  {
    __m128i va, vb, vc, vd, ve, vf, vg, vh;

    // load the eight rows, 8 samples each
    va = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
    vb = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
    vc = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
    vd = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
    ve = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
    vf = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
    vg = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
    vh = _mm_loadu_si128( ( const __m128i* ) src );

    // stage 1: 16-bit interleave of row pairs (a,b), (c,d), (e,f), (g,h)
    __m128i va01b01 = _mm_unpacklo_epi16( va, vb );
    __m128i va23b23 = _mm_unpackhi_epi16( va, vb );
    __m128i vc01d01 = _mm_unpacklo_epi16( vc, vd );
    __m128i vc23d23 = _mm_unpackhi_epi16( vc, vd );
    __m128i ve01f01 = _mm_unpacklo_epi16( ve, vf );
    __m128i ve23f23 = _mm_unpackhi_epi16( ve, vf );
    __m128i vg01h01 = _mm_unpacklo_epi16( vg, vh );
    __m128i vg23h23 = _mm_unpackhi_epi16( vg, vh );

    // stage 2: 32-bit interleave combines quads of rows
    va = _mm_unpacklo_epi32( va01b01, vc01d01 );
    vb = _mm_unpackhi_epi32( va01b01, vc01d01 );
    vc = _mm_unpacklo_epi32( va23b23, vc23d23 );
    vd = _mm_unpackhi_epi32( va23b23, vc23d23 );
    ve = _mm_unpacklo_epi32( ve01f01, vg01h01 );
    vf = _mm_unpackhi_epi32( ve01f01, vg01h01 );
    vg = _mm_unpacklo_epi32( ve23f23, vg23h23 );
    vh = _mm_unpackhi_epi32( ve23f23, vg23h23 );

    // stage 3: 64-bit interleave yields the eight transposed rows
    va01b01 = _mm_unpacklo_epi64( va, ve );
    va23b23 = _mm_unpackhi_epi64( va, ve );
    vc01d01 = _mm_unpacklo_epi64( vb, vf );
    vc23d23 = _mm_unpackhi_epi64( vb, vf );
    ve01f01 = _mm_unpacklo_epi64( vc, vg );
    ve23f23 = _mm_unpackhi_epi64( vc, vg );
    vg01h01 = _mm_unpacklo_epi64( vd, vh );
    vg23h23 = _mm_unpackhi_epi64( vd, vh );

    // store rows 0..7 of the transposed block
    _mm_storeu_si128( ( __m128i* ) dst, va01b01 ); dst += dstStride;
    _mm_storeu_si128( ( __m128i* ) dst, va23b23 ); dst += dstStride;
    _mm_storeu_si128( ( __m128i* ) dst, vc01d01 ); dst += dstStride;
    _mm_storeu_si128( ( __m128i* ) dst, vc23d23 ); dst += dstStride;
    _mm_storeu_si128( ( __m128i* ) dst, ve01f01 ); dst += dstStride;
    _mm_storeu_si128( ( __m128i* ) dst, ve23f23 ); dst += dstStride;
    _mm_storeu_si128( ( __m128i* ) dst, vg01h01 ); dst += dstStride;
    _mm_storeu_si128( ( __m128i* ) dst, vg23h23 );

  }
#if USE_AVX2

  // leave AVX state clean to avoid SSE/AVX transition penalties
  _mm256_zeroupper();
#endif
}
template<X86_VEXT vext>
1593
void applyLut_SIMD( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height, const Pel* lut )
1594
0
{
1595
#if USE_AVX2 && ! ENABLE_VALGRIND_CODE // valgrind will report _mm256_i32gather_epi32 to access uninitialized memory
1596
  // this implementation is only faster on modern CPUs
1597
0
  if( ( width & 15 ) == 0 && ( height & 1 ) == 0 )
1598
0
  {
1599
0
    const __m256i vLutShuf = _mm256_setr_epi8( 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 );
1600
1601
0
    for( int y = 0; y < height; y += 2 )
1602
0
    {
1603
0
      for( int x = 0; x < width; x += 16 )
1604
0
      {
1605
0
        __m256i vin16    = _mm256_loadu_si256       ( ( const __m256i * ) &src[x] );
1606
                                                    
1607
0
        __m256i vin32_1  = _mm256_unpacklo_epi16    ( vin16, _mm256_setzero_si256() );
1608
0
        __m256i vin32_2  = _mm256_unpackhi_epi16    ( vin16, _mm256_setzero_si256() );
1609
1610
0
        __m256i vout32_1 = _mm256_i32gather_epi32   ( ( const int * ) lut, vin32_1, 2 );
1611
0
        __m256i vout32_2 = _mm256_i32gather_epi32   ( ( const int * ) lut, vin32_2, 2 );
1612
1613
0
        vout32_1         = _mm256_shuffle_epi8      ( vout32_1, vLutShuf );
1614
0
        vout32_2         = _mm256_shuffle_epi8      ( vout32_2, vLutShuf );
1615
1616
0
        __m256i vout16   = _mm256_unpacklo_epi64    ( vout32_1, vout32_2 );
1617
1618
0
        _mm256_storeu_si256( ( __m256i * ) &dst[x], vout16 );
1619
        
1620
0
        vin16            = _mm256_loadu_si256       ( ( const __m256i * ) &src[x + srcStride] );
1621
                                                    
1622
0
        vin32_1          = _mm256_unpacklo_epi16    ( vin16, _mm256_setzero_si256() );
1623
0
        vin32_2          = _mm256_unpackhi_epi16    ( vin16, _mm256_setzero_si256() );
1624
                         
1625
0
        vout32_1         = _mm256_i32gather_epi32   ( ( const int * ) lut, vin32_1, 2 );
1626
0
        vout32_2         = _mm256_i32gather_epi32   ( ( const int * ) lut, vin32_2, 2 );
1627
1628
0
        vout32_1         = _mm256_shuffle_epi8      ( vout32_1, vLutShuf );
1629
0
        vout32_2         = _mm256_shuffle_epi8      ( vout32_2, vLutShuf );
1630
1631
0
        vout16           = _mm256_unpacklo_epi64    ( vout32_1, vout32_2 );
1632
1633
0
        _mm256_storeu_si256( ( __m256i * ) &dst[x + dstStride], vout16 );
1634
0
      }
1635
1636
0
      src += ( srcStride << 1 );
1637
0
      dst += ( dstStride << 1 );
1638
0
    }
1639
1640
0
    _mm256_zeroupper();
1641
0
  }
1642
0
  else
1643
0
#endif
1644
0
  {
1645
0
#define RSP_SGNL_OP( ADDR ) dst[ADDR] = lut[src[ADDR]]
1646
0
#define RSP_SGNL_INC        src += srcStride; dst += dstStride;
1647
1648
0
    SIZE_AWARE_PER_EL_OP( RSP_SGNL_OP, RSP_SGNL_INC )
1649
1650
0
#undef RSP_SGNL_OP
1651
0
#undef RSP_SGNL_INC
1652
0
  }
1653
0
}
Unexecuted instantiation: void vvenc::applyLut_SIMD<(vvenc::x86_simd::X86_VEXT)1>(short const*, long, short*, long, int, int, short const*)
Unexecuted instantiation: void vvenc::applyLut_SIMD<(vvenc::x86_simd::X86_VEXT)4>(short const*, long, short*, long, int, int, short const*)
1654
1655
#if INTPTR_MAX == INT64_MAX
1656
template<X86_VEXT vext>
1657
void fillPtrMap_SIMD( void** ptr, ptrdiff_t ptrStride, int width, int height, void* val )
1658
0
{
1659
0
  static_assert( sizeof( ptr ) == 8, "Only supported for 64bit systems!" );
1660
0
  if( ( width & 3 ) == 0 )
1661
0
  {
1662
#if USE_AVX2
1663
    __m256i vval = _mm256_set1_epi64x( ( int64_t ) val );
1664
1665
0
    while( height-- )
1666
0
    {
1667
0
      for( int x = 0; x < width; x += 4 ) _mm256_storeu_si256( ( __m256i* ) &ptr[x], vval );
1668
1669
0
      ptr += ptrStride;
1670
0
    }
1671
#else
1672
    __m128i vval = _mm_set1_epi64x( ( int64_t ) val );
1673
1674
0
    while( height-- )
1675
0
    {
1676
0
      for( int x = 0; x < width; x += 4 )
1677
0
      {
1678
0
        _mm_storeu_si128( ( __m128i* ) &ptr[x + 0], vval );
1679
0
        _mm_storeu_si128( ( __m128i* ) &ptr[x + 2], vval );
1680
0
      }
1681
1682
0
      ptr += ptrStride;
1683
0
    }
1684
#endif
1685
0
  }
1686
0
  else if( ( width & 1 ) == 0 )
1687
0
  {
1688
0
    __m128i vval = _mm_set1_epi64x( ( int64_t ) val );
1689
1690
0
    while( height-- )
1691
0
    {
1692
0
      for( int x = 0; x < width; x += 2 ) _mm_storeu_si128( ( __m128i* ) &ptr[x], vval );
1693
1694
0
      ptr += ptrStride;
1695
0
    }
1696
0
  }
1697
0
  else
1698
0
  {
1699
0
    while( height-- )
1700
0
    {
1701
0
      *ptr = val; ptr += ptrStride;
1702
0
    }
1703
0
  }
1704
0
}
Unexecuted instantiation: void vvenc::fillPtrMap_SIMD<(vvenc::x86_simd::X86_VEXT)1>(void**, long, int, int, void*)
Unexecuted instantiation: void vvenc::fillPtrMap_SIMD<(vvenc::x86_simd::X86_VEXT)4>(void**, long, int, int, void*)
1705
#elif INTPTR_MAX == INT32_MAX
1706
template<X86_VEXT vext>
1707
void fillPtrMap_SIMD( void** ptr, ptrdiff_t ptrStride, int width, int height, void* val )
1708
{
1709
  static_assert( sizeof( ptr ) == 4, "Only supported for 32bit systems!" );
1710
  if( ( width & 7 ) == 0 )
1711
  {
1712
#if USE_AVX2
1713
    __m256i vval = _mm256_set1_epi32( ( int32_t ) val );
1714
1715
    while( height-- )
1716
    {
1717
      for( int x = 0; x < width; x += 8 )
1718
      {
1719
        _mm256_storeu_si256( ( __m256i* ) &ptr[x], vval );
1720
      }
1721
1722
      ptr += ptrStride;
1723
    }
1724
#else
1725
    __m128i vval = _mm_set1_epi32( ( int32_t ) val );
1726
1727
    while( height-- )
1728
    {
1729
      for( int x = 0; x < width; x += 8 )
1730
      {
1731
        _mm_storeu_si128( ( __m128i* ) &ptr[x + 0], vval );
1732
        _mm_storeu_si128( ( __m128i* ) &ptr[x + 4], vval );
1733
      }
1734
1735
      ptr += ptrStride;
1736
    }
1737
#endif
1738
  }
1739
  else if( ( width & 3 ) == 0 )
1740
  {
1741
    __m128i vval = _mm_set1_epi32( ( int32_t ) val );
1742
1743
    while( height-- )
1744
    {
1745
      for( int x = 0; x < width; x += 4 )
1746
      {
1747
        _mm_storeu_si128( ( __m128i* ) &ptr[x], vval );
1748
      }
1749
1750
      ptr += ptrStride;
1751
    }
1752
  }
1753
  else if( ( width & 1 ) == 0 )
1754
  {
1755
    while( height-- )
1756
    {
1757
      ptr[0] = val;
1758
      ptr[1] = val;
1759
1760
      ptr += ptrStride;
1761
    }
1762
  }
1763
  else
1764
  {
1765
    while( height-- )
1766
    {
1767
      for( int x = 0; x < width; ++x )
1768
      {
1769
        ptr[x] = val;
1770
      }
1771
      ptr += ptrStride;
1772
    }
1773
  }
1774
}
1775
#endif  // INTPTR_MAX == INT32_MAX
1776
1777
template<X86_VEXT vext>
1778
uint64_t AvgHighPass_SIMD( const int width, const int height, const Pel* pSrc, const int iSrcStride)
1779
0
{
1780
0
  uint64_t saAct=0;
1781
0
  pSrc -= iSrcStride;
1782
1783
#ifdef USE_AVX2
1784
  int x;
1785
  int sum;
1786
1787
0
  if (width > 16)
1788
0
  {
1789
0
    __m256i scale1 = _mm256_set_epi16 (0,-1,-2,-1,0,-1,-2,-1,0,-1,-2,-1,0,-1,-2,-1);
1790
0
    __m256i scale0 = _mm256_set_epi16 (0,-2,12,-2,0,-2,12,-2,0,-2,12,-2,0,-2,12,-2);
1791
0
    __m256i scale11 = _mm256_set_epi16(0,0,0,0,0,-1,-2,-1,0,-1,-2,-1,0,-1,-2,-1);
1792
0
    __m256i scale00 = _mm256_set_epi16 (0,0,0,0,0,-2,12,-2,0,-2,12,-2,0,-2,12,-2);
1793
0
    __m256i tmp1, tmp2, tmp3;
1794
0
    __m256i line0, lineP1, lineM1;
1795
1796
0
    for (int y = 1; y < height-1; y += 1)
1797
0
    {
1798
0
      for (x = 1; x < width-1-14; x += 14)
1799
0
      {
1800
0
        sum=0;
1801
0
        lineM1 = _mm256_lddqu_si256 ((__m256i*) &pSrc[ (y -1)  *iSrcStride + x-1]);
1802
0
        line0  = _mm256_lddqu_si256 ((__m256i*) &pSrc [(y)*iSrcStride + x-1]);
1803
0
        lineP1 = _mm256_lddqu_si256 ((__m256i*) &pSrc[(y+1)*iSrcStride + x-1]);
1804
1805
0
        tmp1 = _mm256_madd_epi16 (line0, scale0);
1806
0
        tmp2 = _mm256_madd_epi16 (lineP1, scale1);
1807
0
        tmp3 = _mm256_madd_epi16 (lineM1, scale1);
1808
0
        tmp1 = _mm256_add_epi32(tmp1,tmp2);
1809
0
        tmp1 = _mm256_add_epi32(tmp1,tmp3);
1810
0
        tmp1 = _mm256_hadd_epi32(tmp1,tmp1);
1811
0
        tmp1 = _mm256_abs_epi32(tmp1);
1812
0
        tmp1 = _mm256_hadd_epi32(tmp1,tmp1);
1813
0
        sum+=_mm256_extract_epi32 (tmp1, 0);
1814
0
        sum+=_mm256_extract_epi32 (tmp1, 4);
1815
1816
0
        line0  = _mm256_bsrli_epi128 (line0 , 2);
1817
0
        lineP1 = _mm256_bsrli_epi128 (lineP1, 2);
1818
0
        lineM1 = _mm256_bsrli_epi128 (lineM1, 2);
1819
0
        tmp1 = _mm256_madd_epi16 (line0, scale0);
1820
0
        tmp2 = _mm256_madd_epi16 (lineP1, scale1);
1821
0
        tmp3 = _mm256_madd_epi16 (lineM1, scale1);
1822
1823
0
        tmp1 = _mm256_add_epi32(tmp1,tmp2);
1824
0
        tmp1 = _mm256_add_epi32(tmp1,tmp3);
1825
0
        tmp1 = _mm256_hadd_epi32(tmp1,tmp1);
1826
0
        tmp1 = _mm256_abs_epi32(tmp1);
1827
0
        tmp1 = _mm256_hadd_epi32(tmp1,tmp1);
1828
1829
0
        sum+=_mm256_extract_epi32 (tmp1, 0);
1830
0
        sum+=_mm256_extract_epi32 (tmp1, 4);
1831
1832
0
        lineM1 = _mm256_lddqu_si256 ((__m256i*) &pSrc[ (y -1)  *iSrcStride + x-1+2]);
1833
0
        line0  = _mm256_lddqu_si256 ((__m256i*) &pSrc [(y)*iSrcStride + x-1+2]);
1834
0
        lineP1 = _mm256_lddqu_si256 ((__m256i*) &pSrc[(y+1)*iSrcStride + x-1+2]);
1835
0
        tmp1 = _mm256_madd_epi16 (line0, scale00);
1836
0
        tmp2 = _mm256_madd_epi16 (lineP1, scale11);
1837
0
        tmp3 = _mm256_madd_epi16 (lineM1, scale11);
1838
0
        tmp1 = _mm256_add_epi32(tmp1,tmp2);
1839
0
        tmp1 = _mm256_add_epi32(tmp1,tmp3);
1840
0
        tmp1 = _mm256_hadd_epi32(tmp1,tmp1);
1841
0
        tmp1 = _mm256_abs_epi32(tmp1);
1842
0
        tmp1 = _mm256_hadd_epi32(tmp1,tmp1);
1843
0
        sum+=_mm256_extract_epi32 (tmp1, 0);
1844
0
        sum+=_mm256_extract_epi32 (tmp1, 4);
1845
1846
0
        line0  = _mm256_bsrli_epi128 (line0 , 2);
1847
0
        lineP1 = _mm256_bsrli_epi128 (lineP1, 2);
1848
0
        lineM1 = _mm256_bsrli_epi128 (lineM1, 2);
1849
1850
0
        tmp1 = _mm256_madd_epi16 (line0, scale00);
1851
0
        tmp2 = _mm256_madd_epi16 (lineP1, scale11);
1852
0
        tmp3 = _mm256_madd_epi16 (lineM1, scale11);
1853
1854
0
        tmp1 = _mm256_add_epi32(tmp1,tmp2);
1855
0
        tmp1 = _mm256_add_epi32(tmp1,tmp3);
1856
0
        tmp1 = _mm256_hadd_epi32(tmp1,tmp1);
1857
0
        tmp1 = _mm256_abs_epi32(tmp1);
1858
0
        tmp1 = _mm256_hadd_epi32(tmp1,tmp1);
1859
1860
0
        sum+=_mm256_extract_epi32 (tmp1, 0);
1861
0
        sum+=_mm256_extract_epi32 (tmp1, 4);
1862
0
        saAct += (uint64_t) sum;
1863
0
      }
1864
      // last collum
1865
0
      for (; x < width - 1; x++) //
1866
0
      {
1867
0
        const int s = 12 * (int) pSrc[x  + y*iSrcStride ] - 2 * ((int) pSrc[x-1+y*iSrcStride] + (int) pSrc[x+1+y*iSrcStride] + (int) pSrc[x  -iSrcStride+y*iSrcStride] + (int) pSrc[x  +iSrcStride+y*iSrcStride])
1868
0
                                                       - ((int) pSrc[x-1-iSrcStride+y*iSrcStride] + (int) pSrc[x+1-iSrcStride+y*iSrcStride] + (int) pSrc[x-1+iSrcStride+y*iSrcStride] + (int) pSrc[x+1+iSrcStride+y*iSrcStride]);
1869
0
        saAct += abs (s);
1870
0
      }
1871
0
    }
1872
0
  }
1873
0
  else
1874
0
#endif
1875
0
  {
1876
0
    int x;
1877
0
    int sum;
1878
1879
0
    __m128i scale1 = _mm_set_epi16 (0,-1,-2,-1,0,-1,-2,-1);
1880
0
    __m128i scale0 = _mm_set_epi16 (0,-2,12,-2,0,-2,12,-2);
1881
0
    __m128i scale11 = _mm_set_epi16(0,0,0,0,0,-1,-2,-1);
1882
0
    __m128i scale00 = _mm_set_epi16 (0,0,0,0,0,-2,12,-2);
1883
0
    __m128i tmp1, tmp2, tmp3;
1884
0
    __m128i line0, lineP1, lineM1;
1885
1886
0
    for (int y = 1; y < height-1; y += 1)
1887
0
    {
1888
0
      for (x = 1; x < width-1-6; x += 6)
1889
0
      {
1890
0
        sum=0;
1891
0
        lineM1 = _mm_loadu_si128 ((__m128i*) &pSrc[ (y -1)  *iSrcStride + x-1]);
1892
0
        line0  = _mm_loadu_si128 ((__m128i*) &pSrc [(y)*iSrcStride + x-1]);
1893
0
        lineP1 = _mm_loadu_si128 ((__m128i*) &pSrc[(y+1)*iSrcStride + x-1]);
1894
1895
0
        tmp1 = _mm_madd_epi16 (line0, scale0);
1896
0
        tmp2 = _mm_madd_epi16 (lineP1, scale1);
1897
0
        tmp3 = _mm_madd_epi16 (lineM1, scale1);
1898
0
        tmp1 = _mm_add_epi32(tmp1,tmp2);
1899
0
        tmp1 = _mm_add_epi32(tmp1,tmp3);
1900
0
        tmp1 = _mm_hadd_epi32(tmp1,tmp1);
1901
0
        tmp1 = _mm_abs_epi32(tmp1);
1902
0
        tmp1 = _mm_hadd_epi32(tmp1,tmp1);
1903
0
        sum+=_mm_extract_epi32 (tmp1, 0);
1904
1905
0
        line0  = _mm_bsrli_si128 (line0 , 2);
1906
0
        lineP1 = _mm_bsrli_si128 (lineP1, 2);
1907
0
        lineM1 = _mm_bsrli_si128 (lineM1, 2);
1908
0
        tmp1 = _mm_madd_epi16 (line0, scale0);
1909
0
        tmp2 = _mm_madd_epi16 (lineP1, scale1);
1910
0
        tmp3 = _mm_madd_epi16 (lineM1, scale1);
1911
1912
0
        tmp1 = _mm_add_epi32(tmp1,tmp2);
1913
0
        tmp1 = _mm_add_epi32(tmp1,tmp3);
1914
0
        tmp1 = _mm_hadd_epi32(tmp1,tmp1);
1915
0
        tmp1 = _mm_abs_epi32(tmp1);
1916
0
        tmp1 = _mm_hadd_epi32(tmp1,tmp1);
1917
1918
0
        sum+=_mm_extract_epi32 (tmp1, 0);
1919
1920
0
        lineM1 = _mm_loadu_si128 ((__m128i*) &pSrc[ (y -1)  *iSrcStride + x-1+2]);
1921
0
        line0  = _mm_loadu_si128 ((__m128i*) &pSrc [(y)*iSrcStride + x-1+2]);
1922
0
        lineP1 = _mm_loadu_si128 ((__m128i*) &pSrc[(y+1)*iSrcStride + x-1+2]);
1923
0
        tmp1 = _mm_madd_epi16 (line0, scale00);
1924
0
        tmp2 = _mm_madd_epi16 (lineP1, scale11);
1925
0
        tmp3 = _mm_madd_epi16 (lineM1, scale11);
1926
0
        tmp1 = _mm_add_epi32(tmp1,tmp2);
1927
0
        tmp1 = _mm_add_epi32(tmp1,tmp3);
1928
0
        tmp1 = _mm_hadd_epi32(tmp1,tmp1);
1929
0
        tmp1 = _mm_abs_epi32(tmp1);
1930
0
        tmp1 = _mm_hadd_epi32(tmp1,tmp1);
1931
0
        sum+=_mm_extract_epi32 (tmp1, 0);
1932
1933
0
        line0  = _mm_bsrli_si128 (line0 , 2);
1934
0
        lineP1 = _mm_bsrli_si128 (lineP1, 2);
1935
0
        lineM1 = _mm_bsrli_si128 (lineM1, 2);
1936
1937
0
        tmp1 = _mm_madd_epi16 (line0, scale00);
1938
0
        tmp2 = _mm_madd_epi16 (lineP1, scale11);
1939
0
        tmp3 = _mm_madd_epi16 (lineM1, scale11);
1940
1941
0
        tmp1 = _mm_add_epi32(tmp1,tmp2);
1942
0
        tmp1 = _mm_add_epi32(tmp1,tmp3);
1943
0
        tmp1 = _mm_hadd_epi32(tmp1,tmp1);
1944
0
        tmp1 = _mm_abs_epi32(tmp1);
1945
0
        tmp1 = _mm_hadd_epi32(tmp1,tmp1);
1946
1947
0
        sum+=_mm_extract_epi32 (tmp1, 0);
1948
0
        saAct += (uint64_t) sum;
1949
0
      }
1950
1951
      // last collum
1952
0
      for (; x < width - 1; x++) //
1953
0
      {
1954
0
        const int s = 12 * (int) pSrc[x  + y*iSrcStride ] - 2 * ((int) pSrc[x-1+y*iSrcStride] + (int) pSrc[x+1+y*iSrcStride] + (int) pSrc[x  -iSrcStride+y*iSrcStride] + (int) pSrc[x  +iSrcStride+y*iSrcStride])
1955
0
                                                       - ((int) pSrc[x-1-iSrcStride+y*iSrcStride] + (int) pSrc[x+1-iSrcStride+y*iSrcStride] + (int) pSrc[x-1+iSrcStride+y*iSrcStride] + (int) pSrc[x+1+iSrcStride+y*iSrcStride]);
1956
0
        saAct += abs (s);
1957
0
      }
1958
0
    }
1959
0
  }
1960
0
  return saAct;
1961
0
}
Unexecuted instantiation: unsigned long vvenc::AvgHighPass_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, short const*, int)
Unexecuted instantiation: unsigned long vvenc::AvgHighPass_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, short const*, int)
1962
1963
template<X86_VEXT vext>
1964
uint64_t HDHighPass_SIMD  (const int width, const int height,const Pel*  pSrc,const Pel* pSM1,const int iSrcStride,const int iSM1Stride)
1965
0
{
1966
0
  uint64_t taAct = 0;
1967
0
  uint16_t act = 0;
1968
0
  const __m128i scale1 = _mm_set_epi16 (1,1,1,1,1,1,1,1);
1969
0
  pSrc -= iSrcStride;
1970
0
  pSM1 -= iSM1Stride;
1971
0
  int x;
1972
0
  if (width>8)
1973
0
  {
1974
0
    for (int y = 1; y < height - 1; y++)
1975
0
    {
1976
0
      for (x = 1; x < width - 1-8 ; x+=8)  // cnt cols
1977
0
      {
1978
0
        __m128i M0 = _mm_loadu_si128 ((__m128i*) &pSrc  [ y   *iSrcStride + x]); /* load 8 16-bit values */
1979
0
        __m128i M1 = _mm_loadu_si128 ((__m128i*) &pSM1  [y *iSM1Stride + x]);
1980
0
        M1 = _mm_sub_epi16 (M0, M1);
1981
0
        M1 = _mm_abs_epi16 (M1);
1982
0
        M1 = _mm_hadd_epi16 (M1, M1);
1983
1984
        //  (1 + 3 * abs (t)) >> 1
1985
0
        M0 = _mm_add_epi16(M1,M1);
1986
0
        M1 = _mm_add_epi16(M0,M1);
1987
0
        M1 = _mm_add_epi16(M1,scale1);
1988
0
        M1 = _mm_srai_epi16 (M1,1);
1989
1990
0
        M1 = _mm_hadds_epi16 (M1, M1);
1991
0
        M1 = _mm_hadds_epi16 (M1, M1);
1992
0
        _mm_storeu_si16 (&act, M1);
1993
0
        taAct += (uint64_t)act;
1994
0
      }
1995
      // last collum
1996
0
      __m128i M0 = _mm_loadu_si128 ((__m128i*) &pSrc  [ y   *iSrcStride + x]); /* load 8 16-bit values */
1997
0
      __m128i M1 = _mm_loadu_si128 ((__m128i*) &pSM1  [y *iSM1Stride + x]);
1998
1999
0
      M1 = _mm_sub_epi16 (M0, M1);
2000
0
      M1 = _mm_abs_epi16 (M1);
2001
0
      int n=8-width+1+x;
2002
0
      if (n > 0)
2003
0
      {
2004
        //remove n Pixel
2005
0
        switch (n)
2006
0
        {
2007
0
        case 1:
2008
0
        {
2009
0
          M1 = _mm_slli_si128 (M1,2);
2010
0
          M1 = _mm_srli_si128 (M1,2);
2011
0
          break;
2012
0
        }
2013
0
        case 2:
2014
0
        {
2015
0
          M1 = _mm_slli_si128 (M1,4);
2016
0
          M1 = _mm_srli_si128 (M1,4);
2017
0
          break;
2018
0
        }
2019
0
        case 3:
2020
0
        {
2021
0
          M1 = _mm_slli_si128 (M1,6);
2022
0
          M1 = _mm_srli_si128 (M1,6);
2023
0
          break;
2024
0
        }
2025
0
        case 4:
2026
0
        {
2027
0
          M1 = _mm_slli_si128 (M1,8);
2028
0
          M1 = _mm_srli_si128 (M1,8);
2029
0
          break;
2030
0
        }
2031
0
        case 5:
2032
0
        {
2033
0
          M1 = _mm_slli_si128 (M1,10);
2034
0
          M1 = _mm_srli_si128 (M1,10);
2035
0
          break;
2036
0
        }
2037
0
        case 6:
2038
0
        {
2039
0
          M1 = _mm_slli_si128 (M1,12);
2040
0
          M1 = _mm_srli_si128 (M1,12);
2041
0
          break;
2042
0
        }
2043
0
        case 7:
2044
0
        {
2045
0
          M1 = _mm_slli_si128 (M1,14);
2046
0
          M1 = _mm_srli_si128 (M1,14);
2047
0
          break;
2048
0
        }
2049
0
        }
2050
0
      }
2051
0
      M1 = _mm_hadd_epi16 (M1, M1);
2052
      //  (1 + 3 * abs (t)) >> 1
2053
0
      M0 = _mm_add_epi16(M1,M1);
2054
0
      M1 = _mm_add_epi16(M0,M1);
2055
0
      M1 = _mm_add_epi16(M1,scale1);
2056
0
      M1 = _mm_srai_epi16 (M1,1);
2057
2058
0
      M1 = _mm_hadds_epi16 (M1, M1);
2059
0
      M1 = _mm_hadds_epi16 (M1, M1);
2060
0
      _mm_storeu_si16 (&act, M1);
2061
0
      taAct += (uint64_t)act;
2062
0
    }
2063
0
  }
2064
0
  else
2065
0
  {
2066
0
    for (int y = 1; y < height - 1; y++)
2067
0
    {
2068
0
      for (int x = 1; x < width - 1; x++)  // cnt cols
2069
0
      {
2070
0
        const int t = (int) pSrc[x] - (int) pSM1[x];
2071
0
        taAct += (1 + 3 * abs (t)) >> 1;
2072
0
      }
2073
0
      pSrc += iSrcStride;
2074
0
      pSM1 += iSM1Stride;
2075
0
    }
2076
0
  }
2077
0
  return taAct;
2078
0
}
Unexecuted instantiation: unsigned long vvenc::HDHighPass_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, short const*, short const*, int, int)
Unexecuted instantiation: unsigned long vvenc::HDHighPass_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, short const*, short const*, int, int)
2079
2080
template<X86_VEXT vext>
2081
uint64_t  HDHighPass2_SIMD  (const int width, const int height,const Pel*  pSrc,const Pel* pSM1,const Pel* pSM2,const int iSrcStride,const int iSM1Stride,const int iSM2Stride)
2082
0
{
2083
0
  uint64_t taAct = 0;
2084
0
  uint16_t act = 0;
2085
0
  pSrc -= iSrcStride;
2086
0
  pSM1 -= iSM1Stride;
2087
0
  pSM2 -= iSM2Stride;
2088
0
  int x;
2089
0
  if (width>8)
2090
0
  {
2091
0
    for (int y = 1; y < height - 1; y++)
2092
0
    {
2093
0
      for (x = 1; x < width - 1-8 ; x+=8)  // cnt cols
2094
0
      {
2095
0
        __m128i M0 = _mm_loadu_si128 ((__m128i*) &pSrc  [ y   *iSrcStride + x]); /* load 8 16-bit values */
2096
0
        __m128i M1 = _mm_loadu_si128 ((__m128i*) &pSM1  [y *iSM1Stride + x]);
2097
0
        __m128i M2 = _mm_loadu_si128 ((__m128i*) &pSM2  [y *iSM2Stride + x]);
2098
0
        M1 = _mm_slli_epi16 (M1, 1);
2099
0
        M1 = _mm_sub_epi16 (M0, M1);
2100
0
        M1 = _mm_add_epi16 (M1,M2);
2101
0
        M1 = _mm_abs_epi16 (M1);
2102
0
        M1 = _mm_hadd_epi16 (M1, M1);
2103
2104
0
        M1 = _mm_hadds_epi16 (M1, M1);
2105
0
        M1 = _mm_hadds_epi16 (M1, M1);
2106
0
        _mm_storeu_si16 (&act, M1);
2107
0
        taAct += (uint64_t)act;
2108
0
      }
2109
      // last collum
2110
0
      __m128i M0 = _mm_loadu_si128 ((__m128i*) &pSrc  [ y   *iSrcStride + x]); /* load 8 16-bit values */
2111
0
      __m128i M1 = _mm_loadu_si128 ((__m128i*) &pSM1  [y *iSM1Stride + x]);
2112
0
      __m128i M2 = _mm_loadu_si128 ((__m128i*) &pSM2  [y *iSM2Stride + x]);
2113
0
      M1 = _mm_slli_epi16 (M1, 1);
2114
0
      M1 = _mm_sub_epi16 (M0, M1);
2115
0
      M1 = _mm_add_epi16 (M1,M2);
2116
0
      M1 = _mm_abs_epi16 (M1);
2117
0
      int n=8-width+1+x;
2118
0
      if (n > 0)
2119
0
      {
2120
0
        switch (n)
2121
0
        {
2122
0
        case 1:
2123
0
        {
2124
0
          M1 = _mm_slli_si128 (M1,2);
2125
0
          M1 = _mm_srli_si128 (M1,2);
2126
0
          break;
2127
0
        }
2128
0
        case 2:
2129
0
        {
2130
0
          M1 = _mm_slli_si128 (M1,4);
2131
0
          M1 = _mm_srli_si128 (M1,4);
2132
0
          break;
2133
0
        }
2134
0
        case 3:
2135
0
        {
2136
0
          M1 = _mm_slli_si128 (M1,6);
2137
0
          M1 = _mm_srli_si128 (M1,6);
2138
0
          break;
2139
0
        }
2140
0
        case 4:
2141
0
        {
2142
0
          M1 = _mm_slli_si128 (M1,8);
2143
0
          M1 = _mm_srli_si128 (M1,8);
2144
0
          break;
2145
0
        }
2146
0
        case 5:
2147
0
        {
2148
0
          M1 = _mm_slli_si128 (M1,10);
2149
0
          M1 = _mm_srli_si128 (M1,10);
2150
0
          break;
2151
0
        }
2152
0
        case 6:
2153
0
        {
2154
0
          M1 = _mm_slli_si128 (M1,12);
2155
0
          M1 = _mm_srli_si128 (M1,12);
2156
0
          break;
2157
0
        }
2158
0
        case 7:
2159
0
        {
2160
0
          M1 = _mm_slli_si128 (M1,14);
2161
0
          M1 = _mm_srli_si128 (M1,14);
2162
0
          break;
2163
0
        }
2164
0
        }
2165
0
      }
2166
0
      M1 = _mm_hadd_epi16 (M1, M1);
2167
0
      M1 = _mm_hadds_epi16 (M1, M1);
2168
0
      M1 = _mm_hadds_epi16 (M1, M1);
2169
0
      _mm_storeu_si16 (&act, M1);
2170
0
      taAct += (uint64_t)act;
2171
0
    }
2172
0
  }
2173
0
  else
2174
0
  {
2175
0
    for (int y = 1; y < height - 1; y++)
2176
0
    {
2177
0
      for (int x = 1; x < width - 1; x++)  // cnt cols
2178
0
      {
2179
0
        const int t = (int) pSrc[x] - 2 * (int) pSM1[x] + (int) pSM2[x];
2180
0
        taAct += abs (t);
2181
0
      }
2182
0
      pSrc += iSrcStride;
2183
0
      pSM1 += iSM1Stride;
2184
0
      pSM2 += iSM2Stride;
2185
0
    }
2186
0
  }
2187
0
  return taAct;
2188
0
}
Unexecuted instantiation: unsigned long vvenc::HDHighPass2_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, short const*, short const*, short const*, int, int, int)
Unexecuted instantiation: unsigned long vvenc::HDHighPass2_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, short const*, short const*, short const*, int, int, int)
2189
2190
template<X86_VEXT vext>
2191
uint64_t AvgHighPassWithDownsampling_SIMD ( const int width, const int height, const Pel* pSrc, const int iSrcStride)
2192
0
{
2193
0
  uint64_t saAct = 0;
2194
0
  pSrc -= iSrcStride;
2195
0
  pSrc -= iSrcStride;
2196
2197
#ifdef USE_AVX2
2198
0
  if (width > 12)
2199
0
  {
2200
0
    const __m128i scale1 = _mm_set_epi16 (0, 0,-1,-2,-3,-3,-2,-1);
2201
0
    const __m128i scale2 = _mm_set_epi16 (0, 0,-1,-3,12,12,-3,-1);
2202
0
    const __m128i scale3 = _mm_set_epi16 (0, 0, 0,-1,-1,-1,-1, 0);
2203
0
    __m128i tmp1, tmp2,tmp3,tmp4,tmp5;
2204
0
    __m128i l0, lP1, lM1, lP2, lM2, lP3;
2205
2206
    int sum;
2207
2208
0
    for (int y = 2; y < height-2; y += 2)
2209
0
    {
2210
0
      for (int x = 2; x < width-2; x += 12)
2211
0
      {
2212
0
        __m256i lineM2 = _mm256_lddqu_si256 ((__m256i*) &pSrc[(y-2)*iSrcStride + x-2]);
2213
0
        __m256i lineM1 = _mm256_lddqu_si256 ((__m256i*) &pSrc[(y-1)*iSrcStride + x-2]);
2214
0
        __m256i line0  = _mm256_lddqu_si256 ((__m256i*) &pSrc[ y   *iSrcStride + x-2]);
2215
0
        __m256i lineP1 = _mm256_lddqu_si256 ((__m256i*) &pSrc[(y+1)*iSrcStride + x-2]);
2216
0
        __m256i lineP2 = _mm256_lddqu_si256 ((__m256i*) &pSrc[(y+2)*iSrcStride + x-2]);
2217
0
        __m256i lineP3 = _mm256_lddqu_si256 ((__m256i*) &pSrc[(y+3)*iSrcStride + x-2]);
2218
2219
0
        for (int xx = 0; xx < 3; xx++)
2220
0
        {
2221
0
          l0  = _mm256_castsi256_si128 (line0 );
2222
0
          lP1 = _mm256_castsi256_si128 (lineP1);
2223
0
          lM1 = _mm256_castsi256_si128 (lineM1);
2224
0
          lP2 = _mm256_castsi256_si128 (lineP2);
2225
0
          lM2 = _mm256_castsi256_si128 (lineM2);
2226
0
          lP3 = _mm256_castsi256_si128 (lineP3);
2227
2228
0
          if ((xx << 2) + x < width-2)
2229
0
          {
2230
0
            sum = 0;
2231
0
            tmp1 = _mm_madd_epi16 (l0, scale2);
2232
0
            tmp2 = _mm_madd_epi16 (lP1, scale2);
2233
0
            tmp3 = _mm_add_epi32 (tmp1, tmp2);
2234
0
            tmp1 = _mm_madd_epi16 (lM1, scale1);
2235
0
            tmp2 = _mm_madd_epi16 (lP2, scale1);
2236
0
            tmp4 = _mm_add_epi32(tmp1,tmp2);
2237
0
            tmp4 = _mm_add_epi32(tmp4,tmp3);
2238
0
            tmp1 = _mm_madd_epi16 (lM2, scale3);
2239
0
            tmp2 = _mm_madd_epi16 (lP3, scale3);
2240
0
            tmp5 = _mm_add_epi32(tmp1,tmp2);
2241
0
            tmp4 = _mm_add_epi32(tmp4,tmp5);
2242
0
            tmp1 = _mm_hadd_epi32 (tmp4, tmp4);
2243
0
            tmp1 = _mm_hadd_epi32 (tmp1, tmp1);
2244
0
            tmp1 = _mm_abs_epi32(tmp1);
2245
0
            sum += _mm_extract_epi32 (tmp1, 0);
2246
0
            saAct += (uint64_t) sum;
2247
0
           }
2248
0
          if ((xx << 2) + x + 2 < width-2)
2249
0
          {
2250
0
            sum = 0;
2251
0
            l0  = _mm_bsrli_si128 (l0 , 4);
2252
0
            lP1 = _mm_bsrli_si128 (lP1, 4);
2253
0
            tmp1 = _mm_madd_epi16 (l0, scale2);
2254
0
            tmp2 = _mm_madd_epi16 (lP1, scale2);
2255
0
            tmp3 = _mm_add_epi32 (tmp1, tmp2);
2256
2257
0
            lM1 = _mm_bsrli_si128 (lM1, 4);
2258
0
            lP2 = _mm_bsrli_si128 (lP2, 4);
2259
0
            tmp1 = _mm_madd_epi16 (lM1, scale1);
2260
0
            tmp2 = _mm_madd_epi16 (lP2, scale1);
2261
0
            tmp4 = _mm_add_epi32(tmp1,tmp2);
2262
0
            tmp4 = _mm_add_epi32(tmp4,tmp3);
2263
2264
0
            lM2 = _mm_bsrli_si128 (lM2, 4);
2265
0
            lP3 = _mm_bsrli_si128 (lP3, 4);
2266
0
            tmp1 = _mm_madd_epi16 (lM2, scale3);
2267
0
            tmp2 = _mm_madd_epi16 (lP3, scale3);
2268
0
            tmp5 = _mm_add_epi32(tmp1,tmp2);
2269
0
            tmp4 = _mm_add_epi32(tmp4,tmp5);
2270
0
            tmp1 = _mm_hadd_epi32 (tmp4, tmp4);
2271
0
            tmp1 = _mm_hadd_epi32 (tmp1, tmp1);
2272
0
            tmp1 = _mm_abs_epi32(tmp1);
2273
0
            sum += _mm_extract_epi32 (tmp1, 0);
2274
2275
0
             saAct += (uint64_t) sum;
2276
             /* 4 byte to the right */
2277
0
            lineM2 = _mm256_permute4x64_epi64 (lineM2, 0x39);
2278
0
            lineM1 = _mm256_permute4x64_epi64 (lineM1, 0x39);
2279
0
            line0  = _mm256_permute4x64_epi64 (line0 , 0x39);
2280
0
            lineP1 = _mm256_permute4x64_epi64 (lineP1, 0x39);
2281
0
            lineP2 = _mm256_permute4x64_epi64 (lineP2, 0x39);
2282
0
            lineP3 = _mm256_permute4x64_epi64 (lineP3, 0x39);
2283
0
            }
2284
0
        }
2285
0
      }
2286
0
    }
2287
0
  }
2288
0
  else
2289
0
#endif
2290
0
  {
2291
0
    if (width > 6)
2292
0
    {
2293
0
      const __m128i scale1 = _mm_set_epi16 (0, 0,-1,-2,-3,-3,-2,-1);
2294
0
      const __m128i scale2 = _mm_set_epi16 (0, 0,-1,-3,12,12,-3,-1);
2295
0
      const __m128i scale3 = _mm_set_epi16 (0, 0, 0,-1,-1,-1,-1, 0);
2296
0
      __m128i tmp1, tmp2,tmp3,tmp4,tmp5;
2297
0
      __m128i l0, lP1, lM1, lP2, lM2, lP3;
2298
2299
0
      int sum;
2300
2301
0
      for (int y = 2; y < height-2; y += 2)
2302
0
      {
2303
0
        for (int x = 2; x < width-2; x += 4)
2304
0
        {
2305
0
          {
2306
0
            lM2 = _mm_loadu_si128 ((__m128i*) &pSrc[(y-2)*iSrcStride + x-2]);
2307
0
            lM1 = _mm_loadu_si128 ((__m128i*) &pSrc[(y-1)*iSrcStride + x-2]);
2308
0
            l0  = _mm_loadu_si128 ((__m128i*) &pSrc[ y   *iSrcStride + x-2]);
2309
0
            lP1 = _mm_loadu_si128 ((__m128i*) &pSrc[(y+1)*iSrcStride + x-2]);
2310
0
            lP2 = _mm_loadu_si128 ((__m128i*) &pSrc[(y+2)*iSrcStride + x-2]);
2311
0
            lP3 = _mm_loadu_si128 ((__m128i*) &pSrc[(y+3)*iSrcStride + x-2]);
2312
2313
0
            if ( x < width-2)
2314
0
            {
2315
0
              sum = 0;
2316
0
              tmp1 = _mm_madd_epi16 (l0, scale2);
2317
0
              tmp2 = _mm_madd_epi16 (lP1, scale2);
2318
0
              tmp3 = _mm_add_epi32 (tmp1, tmp2);
2319
0
              tmp1 = _mm_madd_epi16 (lM1, scale1);
2320
0
              tmp2 = _mm_madd_epi16 (lP2, scale1);
2321
0
              tmp4 = _mm_add_epi32(tmp1,tmp2);
2322
0
              tmp4 = _mm_add_epi32(tmp4,tmp3);
2323
0
              tmp1 = _mm_madd_epi16 (lM2, scale3);
2324
0
              tmp2 = _mm_madd_epi16 (lP3, scale3);
2325
0
              tmp5 = _mm_add_epi32(tmp1,tmp2);
2326
0
              tmp4 = _mm_add_epi32(tmp4,tmp5);
2327
0
              tmp1 = _mm_hadd_epi32 (tmp4, tmp4);
2328
0
              tmp1 = _mm_hadd_epi32 (tmp1, tmp1);
2329
0
              tmp1 = _mm_abs_epi32(tmp1);
2330
0
              sum += _mm_extract_epi32 (tmp1, 0);
2331
2332
0
              saAct += (uint64_t) sum;
2333
0
             }
2334
0
            if (x + 2 < width-2)
2335
0
            {
2336
0
              sum = 0;
2337
0
              l0  = _mm_bsrli_si128 (l0 , 4);
2338
0
              lP1 = _mm_bsrli_si128 (lP1, 4);
2339
0
              tmp1 = _mm_madd_epi16 (l0, scale2);
2340
0
              tmp2 = _mm_madd_epi16 (lP1, scale2);
2341
0
              tmp3 = _mm_add_epi32 (tmp1, tmp2);
2342
2343
0
              lM1 = _mm_bsrli_si128 (lM1, 4);
2344
0
              lP2 = _mm_bsrli_si128 (lP2, 4);
2345
0
              tmp1 = _mm_madd_epi16 (lM1, scale1);
2346
0
              tmp2 = _mm_madd_epi16 (lP2, scale1);
2347
0
              tmp4 = _mm_add_epi32(tmp1,tmp2);
2348
0
              tmp4 = _mm_add_epi32(tmp4,tmp3);
2349
2350
0
              lM2 = _mm_bsrli_si128 (lM2, 4);
2351
0
              lP3 = _mm_bsrli_si128 (lP3, 4);
2352
0
              tmp1 = _mm_madd_epi16 (lM2, scale3);
2353
0
              tmp2 = _mm_madd_epi16 (lP3, scale3);
2354
0
              tmp5 = _mm_add_epi32(tmp1,tmp2);
2355
0
              tmp4 = _mm_add_epi32(tmp4,tmp5);
2356
0
              tmp1 = _mm_hadd_epi32 (tmp4, tmp4);
2357
0
              tmp1 = _mm_hadd_epi32 (tmp1, tmp1);
2358
0
              tmp1 = _mm_abs_epi32(tmp1);
2359
0
              sum += _mm_extract_epi32 (tmp1, 0);
2360
0
              saAct += (uint64_t) sum;
2361
0
              }
2362
0
          }
2363
0
        }
2364
0
      }
2365
0
    }
2366
0
 }
2367
0
  return saAct;
2368
0
}
Unexecuted instantiation: unsigned long vvenc::AvgHighPassWithDownsampling_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, short const*, int)
Unexecuted instantiation: unsigned long vvenc::AvgHighPassWithDownsampling_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, short const*, int)
2369
template<X86_VEXT vext>
2370
uint64_t AvgHighPassWithDownsamplingDiff1st_SIMD (const int width, const int height, const Pel *pSrc,const Pel *pSrcM1, const int iSrcStride, const int iSrcM1Stride)
2371
0
{
2372
0
  uint64_t taAct = 0;
2373
0
  uint16_t act = 0;
2374
0
  pSrc -= iSrcStride;
2375
0
  pSrc -= iSrcStride;
2376
0
  pSrcM1-=iSrcM1Stride;
2377
0
  pSrcM1-=iSrcM1Stride;
2378
0
  int32_t x;
2379
0
  int32_t y;
2380
0
  const __m128i scale1 = _mm_set_epi16 (1,1,1,1,1,1,1,1);
2381
0
  for (y = 2; y < height-2; y += 2)
2382
0
  {
2383
0
    for (x = 2; x < width-2-10; x += 8)
2384
0
    {
2385
0
      __m128i lineM0u = _mm_loadu_si128 ((__m128i*) &pSrc  [ y   *iSrcStride + x]); /* load 8 16-bit values */
2386
0
      __m128i lineM0d = _mm_loadu_si128 ((__m128i*) &pSrc  [(y+1)*iSrcStride + x]);
2387
0
      __m128i lineM1u = _mm_loadu_si128 ((__m128i*) &pSrcM1[ y   *iSrcM1Stride + x]);
2388
0
      __m128i lineM1d = _mm_loadu_si128 ((__m128i*) &pSrcM1[(y+1)*iSrcM1Stride + x]);
2389
0
      __m128i M0 = _mm_add_epi16 (lineM0u, lineM0d);
2390
0
      __m128i M1 = _mm_add_epi16 (lineM1u, lineM1d);
2391
2392
0
      M1 = _mm_sub_epi16 (M0, M1); /* abs (sum (o[u0, u1, d0, d1]) - sum (oM1[u0, u1, d0, d1])) */
2393
0
      M1 = _mm_hadd_epi16 (M1, M1);
2394
0
      M1 = _mm_abs_epi16 (M1);
2395
2396
      //  (1 + 3 * abs (t)) >> 1
2397
0
      M0 = _mm_add_epi16(M1,M1);
2398
0
      M1 = _mm_add_epi16(M0,M1);
2399
0
      M1 = _mm_add_epi16(M1,scale1);
2400
0
      M1 = _mm_srai_epi16 (M1,1);
2401
2402
0
      M1 = _mm_hadds_epi16 (M1, M1);
2403
0
      M1 = _mm_hadds_epi16 (M1, M1);
2404
0
      _mm_storeu_si16 (&act, M1);
2405
0
      taAct += (uint64_t)act;
2406
0
    }
2407
    // last collum
2408
0
    {
2409
0
      __m128i lineM0u = _mm_loadu_si128 ((__m128i*) &pSrc  [ y   *iSrcStride + x]); /* load 8 16-bit values */
2410
0
      __m128i lineM0d = _mm_loadu_si128 ((__m128i*) &pSrc  [(y+1)*iSrcStride + x]);
2411
0
      __m128i lineM1u = _mm_loadu_si128 ((__m128i*) &pSrcM1[ y   *iSrcM1Stride + x]);
2412
0
      __m128i lineM1d = _mm_loadu_si128 ((__m128i*) &pSrcM1[(y+1)*iSrcM1Stride + x]);
2413
0
      __m128i M0 = _mm_add_epi16 (lineM0u, lineM0d);
2414
0
      __m128i M1 = _mm_add_epi16 (lineM1u, lineM1d);
2415
0
      M1 = _mm_sub_epi16 (M0, M1); /* abs (sum (o[u0, u1, d0, d1]) - sum (oM1[u0, u1, d0, d1])) */
2416
2417
0
      int n=8-width+2+x;
2418
0
      if (n > 0)
2419
0
      {
2420
        //remove n Pixel
2421
0
        if (n==2)
2422
0
        {
2423
0
          M1 = _mm_slli_si128 (M1, 4);
2424
0
          M1 = _mm_srli_si128 (M1,4);
2425
0
        }
2426
0
        else if  (n==4)
2427
0
        {
2428
0
          M1 = _mm_slli_si128 (M1, 8);
2429
0
          M1 = _mm_srli_si128 (M1,8);
2430
0
        }
2431
0
        else if  (n==6)
2432
0
        {
2433
0
          M1 = _mm_slli_si128 (M1, 12);
2434
0
          M1 = _mm_srli_si128 (M1,12);
2435
0
        }
2436
0
      }
2437
0
      M1 = _mm_hadd_epi16 (M1, M1);
2438
0
      M1 = _mm_abs_epi16 (M1);
2439
2440
      //  (1 + 3 * abs (t)) >> 1
2441
0
      M0 = _mm_add_epi16(M1,M1);
2442
0
      M1 = _mm_add_epi16(M0,M1);
2443
0
      M1 = _mm_add_epi16(M1,scale1);
2444
0
      M1 = _mm_srai_epi16 (M1,1);
2445
2446
0
      M1 = _mm_hadds_epi16 (M1, M1);
2447
0
      M1 = _mm_hadds_epi16 (M1, M1);
2448
0
      _mm_storeu_si16 (&act, M1);
2449
2450
0
      taAct += (uint64_t)act;
2451
0
    }
2452
0
  }
2453
0
  return (taAct);
2454
0
}
Unexecuted instantiation: unsigned long vvenc::AvgHighPassWithDownsamplingDiff1st_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, short const*, short const*, int, int)
Unexecuted instantiation: unsigned long vvenc::AvgHighPassWithDownsamplingDiff1st_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, short const*, short const*, int, int)
2455
template<X86_VEXT vext>
2456
uint64_t AvgHighPassWithDownsamplingDiff2nd_SIMD (const int width,const int height,const Pel* pSrc,const Pel* pSrcM1,const Pel* pSrcM2,const int iSrcStride,const int iSM1Stride,const int iSM2Stride)
2457
0
{
2458
0
  uint64_t taAct = 0;
2459
0
  uint16_t act = 0;
2460
0
  int32_t y;
2461
0
  int32_t x;
2462
0
  pSrc -= iSrcStride;
2463
0
  pSrc -= iSrcStride;
2464
0
  pSrcM1-=iSM1Stride;
2465
0
  pSrcM1-=iSM1Stride;
2466
0
  pSrcM2-=iSM2Stride;
2467
0
  pSrcM2-=iSM2Stride;
2468
2469
0
  for (y = 2; y < height-2; y += 2)
2470
0
  {
2471
0
    for (x = 2; x < width-2-10; x += 8)
2472
0
    {
2473
0
      __m128i lineM0u = _mm_loadu_si128 ((__m128i*) &pSrc  [ y   *iSrcStride + x]); /* load 8 16-bit values */
2474
0
      __m128i lineM0d = _mm_loadu_si128 ((__m128i*) &pSrc  [(y+1)*iSrcStride + x]);
2475
0
      __m128i lineM1u = _mm_loadu_si128 ((__m128i*) &pSrcM1[ y   *iSM1Stride + x]);
2476
0
      __m128i lineM1d = _mm_loadu_si128 ((__m128i*) &pSrcM1[(y+1)*iSM1Stride + x]);
2477
0
      __m128i lineM2u = _mm_loadu_si128 ((__m128i*) &pSrcM2[ y   *iSM2Stride + x]);
2478
0
      __m128i lineM2d = _mm_loadu_si128 ((__m128i*) &pSrcM2[(y+1)*iSM2Stride + x]);
2479
2480
0
      __m128i M0 = _mm_add_epi16 (lineM0u, lineM0d);
2481
0
      __m128i M1 = _mm_add_epi16 (lineM1u, lineM1d);
2482
0
      __m128i M2 = _mm_add_epi16 (lineM2u, lineM2d);
2483
2484
0
      M0 = _mm_add_epi16 (M0, M2);
2485
0
      M0 = _mm_hadd_epi16 (M0, M1);
2486
0
      M1 = _mm_shuffle_epi32 (M0, 0xee);
2487
0
      M1 = _mm_slli_epi16 (M1, 0x1);
2488
0
      M1 = _mm_sub_epi16 (M0, M1);
2489
0
      M1 = _mm_abs_epi16 (M1);
2490
0
      M1 = _mm_hadds_epi16 (M1, M1);
2491
0
      M1 = _mm_hadds_epi16 (M1, M1);
2492
2493
0
      _mm_storeu_si16 (&act, M1);
2494
0
      taAct += (uint64_t) act;
2495
0
    }
2496
    // last collum
2497
0
    {
2498
0
      __m128i lineM0u = _mm_loadu_si128 ((__m128i*) &pSrc  [ y   *iSrcStride + x]); /* load 8 16-bit values */
2499
0
      __m128i lineM0d = _mm_loadu_si128 ((__m128i*) &pSrc  [(y+1)*iSrcStride + x]);
2500
0
      __m128i lineM1u = _mm_loadu_si128 ((__m128i*) &pSrcM1[ y   *iSM1Stride + x]);
2501
0
      __m128i lineM1d = _mm_loadu_si128 ((__m128i*) &pSrcM1[(y+1)*iSM1Stride + x]);
2502
0
      __m128i lineM2u = _mm_loadu_si128 ((__m128i*) &pSrcM2[ y   *iSM2Stride + x]);
2503
0
      __m128i lineM2d = _mm_loadu_si128 ((__m128i*) &pSrcM2[(y+1)*iSM2Stride + x]);
2504
2505
0
      __m128i M0 = _mm_add_epi16 (lineM0u, lineM0d);
2506
0
      __m128i M1 = _mm_add_epi16 (lineM1u, lineM1d);
2507
0
      __m128i M2 = _mm_add_epi16 (lineM2u, lineM2d);
2508
2509
0
      M0 = _mm_add_epi16 (M0, M2);
2510
0
      int n=8-width+2+x;
2511
0
      if (n > 0)
2512
0
      {
2513
        //remove n Pixel
2514
0
        if (n==2)
2515
0
        {
2516
0
          M0 = _mm_slli_si128 (M0, 4);
2517
0
          M0 = _mm_srli_si128 (M0,4);
2518
0
          M1 = _mm_slli_si128 (M1, 4);
2519
0
          M1 = _mm_srli_si128 (M1,4);
2520
0
        }
2521
0
        else if  (n==4)
2522
0
        {
2523
0
          M0 = _mm_slli_si128 (M0, 8);
2524
0
          M0 = _mm_srli_si128 (M0,8);
2525
0
          M1 = _mm_slli_si128 (M1, 8);
2526
0
          M1 = _mm_srli_si128 (M1,8);
2527
0
        }
2528
0
        else if  (n==6)
2529
0
        {
2530
0
          M0 = _mm_slli_si128 (M0, 12);
2531
0
          M0 = _mm_srli_si128 (M0,12);
2532
0
          M1 = _mm_slli_si128 (M1, 12);
2533
0
          M1 = _mm_srli_si128 (M1,12);
2534
0
        }
2535
0
      }
2536
2537
0
      M0 = _mm_hadd_epi16 (M0, M1);
2538
0
      M1 = _mm_shuffle_epi32 (M0, 0xee);
2539
0
      M1 = _mm_slli_epi16 (M1, 0x1);
2540
0
      M1 = _mm_sub_epi16 (M0, M1);
2541
0
      M1 = _mm_abs_epi16 (M1);
2542
0
      M1 = _mm_hadds_epi16 (M1, M1);
2543
0
      M1 = _mm_hadds_epi16 (M1, M1);
2544
2545
0
      _mm_storeu_si16 (&act, M1);
2546
0
      taAct += (uint64_t) act;
2547
0
    }
2548
0
  }
2549
0
  return taAct ;
2550
0
}
Unexecuted instantiation: unsigned long vvenc::AvgHighPassWithDownsamplingDiff2nd_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, short const*, short const*, short const*, int, int, int)
Unexecuted instantiation: unsigned long vvenc::AvgHighPassWithDownsamplingDiff2nd_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, short const*, short const*, short const*, int, int, int)
2551
2552
/// Installs the x86 SIMD implementations into the PelBufferOps
/// function-pointer table, overriding the generic (scalar) defaults.
/// Instantiated per supported vector-extension level (vext).
template<X86_VEXT vext>
void PelBufferOps::_initPelBufOpsX86()
{
  // width-generic kernels
  addAvg   = addAvg_SSE<vext>;
  reco     = recoCore_SSE<vext>;
  copyClip = copyClip_SSE<vext>;
  roundGeo = roundGeo_SSE<vext>;

  // fixed-width averaging variants (block width as template argument)
  addAvg4  = addAvg_SSE<vext, 4>;
  addAvg8  = addAvg_SSE<vext, 8>;
  addAvg16 = addAvg_SSE<vext, 16>;

  sub4 = sub_SSE<vext, 4>;
  sub8 = sub_SSE<vext, 8>;

  copyClip4 = copyClip_SSE<vext, 4>;
  copyClip8 = copyClip_SSE<vext, 8>;

  reco4 = reco_SSE<vext, 4>;
  reco8 = reco_SSE<vext, 8>;

  // linear transform (scale/shift/offset) entry points
  linTf4 = linTf_SSE_entry<vext, 4>;
  linTf8 = linTf_SSE_entry<vext, 8>;

  copyBuffer = copyBufferSimd<vext>;

#if ENABLE_SIMD_OPT_BCW
  // bi-prediction with CU-level weights
  removeHighFreq8 = removeHighFreq_SSE<vext, 8>;
  removeHighFreq4 = removeHighFreq_SSE<vext, 4>;

  wghtAvg4 = addWghtAvg_SSE<vext, 4>;
  wghtAvg8 = addWghtAvg_SSE<vext, 8>;

#endif
  transpose4x4   = transposeNxN_SSE<vext, 4>;
  transpose8x8   = transposeNxN_SSE<vext, 8>;
  roundIntVector = roundIntVector_SIMD<vext>;

  // matrix-based intra prediction (MIP) matrix multiplications
  mipMatrixMul_4_4 = mipMatrixMul_SSE<vext, 4, 4>;
  mipMatrixMul_8_4 = mipMatrixMul_SSE<vext, 8, 4>;
  mipMatrixMul_8_8 = mipMatrixMul_SSE<vext, 8, 8>;

  weightCiip = weightCiip_SSE<vext>;

  applyLut = applyLut_SIMD<vext>;

  fillPtrMap = fillPtrMap_SIMD<vext>;

  // activity / high-pass analysis kernels defined above in this file
  AvgHighPassWithDownsampling = AvgHighPassWithDownsampling_SIMD<vext>;
  AvgHighPass = AvgHighPass_SIMD<vext>;
  AvgHighPassWithDownsamplingDiff1st = AvgHighPassWithDownsamplingDiff1st_SIMD<vext>;
  AvgHighPassWithDownsamplingDiff2nd = AvgHighPassWithDownsamplingDiff2nd_SIMD<vext>;
  HDHighPass = HDHighPass_SIMD<vext>;
  HDHighPass2 = HDHighPass2_SIMD<vext>;
}
Unexecuted instantiation: void vvenc::PelBufferOps::_initPelBufOpsX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::PelBufferOps::_initPelBufOpsX86<(vvenc::x86_simd::X86_VEXT)4>()
2607
2608
template void PelBufferOps::_initPelBufOpsX86<SIMDX86>();
2609
2610
2611
} // namespace vvenc
2612
2613
//! \}
2614
2615
#endif // ENABLE_SIMD_OPT_BUFFER
2616
#endif // TARGET_SIMD_X86
2617