Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/BufferX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or
4
other Intellectual Property Rights other than the copyrights concerning
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     BufferX86.h
44
    \brief    SIMD averaging.
45
*/
46
47
//! \ingroup CommonLib
48
//! \{
49
50
#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP 1
51
52
#include "CommonLib/CommonDef.h"
53
#include "CommonDefX86.h"
54
#include "CommonLib/Unit.h"
55
#include "CommonLib/Buffer.h"
56
#include "CommonLib/InterpolationFilter.h"
57
58
#if ENABLE_SIMD_OPT_BUFFER
59
#ifdef TARGET_SIMD_X86
60
61
namespace vvdec
62
{
63
64
template< X86_VEXT vext, int W >
65
void addAvg_SSE( const int16_t* src0, ptrdiff_t src0Stride, const int16_t* src1, ptrdiff_t src1Stride, int16_t *dst, ptrdiff_t dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng )
66
0
{
67
#if USE_AVX2
68
0
  if( W == 16 )
69
0
  {
70
0
    __m256i vone      = _mm256_set1_epi16( 1 );
71
0
    __m256i voffset   = _mm256_set1_epi32( offset );
72
0
    __m256i vibdimin  = _mm256_set1_epi16( clpRng.min() );
73
0
    __m256i vibdimax  = _mm256_set1_epi16( clpRng.max() );
74
75
0
    for( int row = 0; row < height; row++ )
76
0
    {
77
0
      for( int col = 0; col < width; col += 16 )
78
0
      {
79
0
        __m256i vsrc0 = _mm256_loadu_si256( ( const __m256i* )&src0[col] );
80
0
        __m256i vsrc1 = _mm256_loadu_si256( ( const __m256i* )&src1[col] );
81
82
0
        __m256i vsumlo = _mm256_madd_epi16( _mm256_unpacklo_epi16( vsrc0, vsrc1 ), vone );
83
0
        __m256i vsumhi = _mm256_madd_epi16( _mm256_unpackhi_epi16( vsrc0, vsrc1 ), vone );
84
85
0
        vsumlo = _mm256_add_epi32        ( vsumlo, voffset );
86
0
        vsumhi = _mm256_add_epi32        ( vsumhi, voffset );
87
0
        vsumlo = _mm256_srai_epi32       ( vsumlo, shift );
88
0
        vsumhi = _mm256_srai_epi32       ( vsumhi, shift );
89
90
0
        __m256i vsum = _mm256_packs_epi32( vsumlo, vsumhi );
91
0
        vsum = _mm256_min_epi16( vibdimax, _mm256_max_epi16( vibdimin, vsum ) );
92
93
0
        _mm256_storeu_si256( ( __m256i * )&dst[col], vsum );
94
0
      }
95
96
0
      src0 += src0Stride;
97
0
      src1 += src1Stride;
98
0
      dst  +=  dstStride;
99
0
    }
100
0
  }
101
0
  else
102
0
#endif
103
0
  if( W >= 8 )
104
0
  {
105
0
    __m128i vone      = _mm_set1_epi16( 1 );
106
0
    __m128i voffset   = _mm_set1_epi32( offset );
107
0
    __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
108
0
    __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
109
110
0
    for( int row = 0; row < height; row++ )
111
0
    {
112
0
      for( int col = 0; col < width; col += 8 )
113
0
      {
114
0
        __m128i vsrc0 = _mm_loadu_si128( ( const __m128i* )&src0[col] );
115
0
        __m128i vsrc1 = _mm_loadu_si128( ( const __m128i* )&src1[col] );
116
117
0
        __m128i vsumlo = _mm_madd_epi16( _mm_unpacklo_epi16( vsrc0, vsrc1 ), vone );
118
0
        __m128i vsumhi = _mm_madd_epi16( _mm_unpackhi_epi16( vsrc0, vsrc1 ), vone );
119
120
0
        vsumlo = _mm_add_epi32        ( vsumlo, voffset );
121
0
        vsumhi = _mm_add_epi32        ( vsumhi, voffset );
122
0
        vsumlo = _mm_srai_epi32       ( vsumlo, shift );
123
0
        vsumhi = _mm_srai_epi32       ( vsumhi, shift );
124
125
0
        __m128i vsum = _mm_packs_epi32( vsumlo, vsumhi );
126
0
        vsum = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vsum ) );
127
128
0
        _mm_storeu_si128( ( __m128i * )&dst[col], vsum );
129
0
      }
130
131
0
      src0 += src0Stride;
132
0
      src1 += src1Stride;
133
0
      dst  +=  dstStride;
134
0
    }
135
0
  }
136
0
  else if( W == 4 )
137
0
  {
138
0
    __m128i vone      = _mm_set1_epi16( 1 );
139
0
    __m128i voffset   = _mm_set1_epi32( offset );
140
0
    __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
141
0
    __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
142
0
    __m128i vsumhi    = _mm_setzero_si128();
143
144
0
    for( int row = 0; row < height; row++ )
145
0
    {
146
0
      for( int col = 0; col < width; col += 4 )
147
0
      {
148
0
        __m128i vsrc0 = _mm_loadu_si64( ( const __m128i* )&src0[col] );
149
0
        __m128i vsrc1 = _mm_loadu_si64( ( const __m128i* )&src1[col] );
150
151
0
        __m128i vsumlo = _mm_madd_epi16( _mm_unpacklo_epi16( vsrc0, vsrc1 ), vone );
152
153
0
        vsumlo = _mm_add_epi32        ( vsumlo, voffset );
154
0
        vsumlo = _mm_srai_epi32       ( vsumlo, shift );
155
156
0
        __m128i vsum = _mm_packs_epi32( vsumlo, vsumhi );
157
0
        vsum = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vsum ) );
158
159
0
        _mm_storeu_si64( ( __m128i * )&dst[col], vsum );
160
0
      }
161
162
0
      src0 += src0Stride;
163
0
      src1 += src1Stride;
164
0
      dst  +=  dstStride;
165
0
    }
166
0
  }
167
0
  else
168
0
  {
169
0
    THROW_FATAL( "Unsupported size" );
170
0
  }
171
#if USE_AVX2
172
173
0
  _mm256_zeroupper();
174
0
#endif
175
0
}
Unexecuted instantiation: void vvdec::addAvg_SSE<(vvdec::x86_simd::X86_VEXT)1, 16>(short const*, long, short const*, long, short*, long, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::addAvg_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(short const*, long, short const*, long, short*, long, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::addAvg_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(short const*, long, short const*, long, short*, long, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::addAvg_SSE<(vvdec::x86_simd::X86_VEXT)4, 16>(short const*, long, short const*, long, short*, long, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::addAvg_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(short const*, long, short const*, long, short*, long, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::addAvg_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(short const*, long, short const*, long, short*, long, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
176
177
template< X86_VEXT vext, int W >
178
void reco_SSE( const int16_t* src0, ptrdiff_t src0Stride, const int16_t* src1, ptrdiff_t src1Stride, int16_t *dst, ptrdiff_t dstStride, int width, int height, const ClpRng& clpRng )
179
0
{
180
  // src0 needs to be aligned for AVX2
181
182
0
  if( W == 8 )
183
0
  {
184
#if USE_AVX2
185
0
    if( vext >= AVX2 && (width & 15) == 0 )
186
0
    {
187
0
      __m256i vbdmin = _mm256_set1_epi16( clpRng.min() );
188
0
      __m256i vbdmax = _mm256_set1_epi16( clpRng.max() );
189
190
      _mm_prefetch( ( const char* ) src0, _MM_HINT_T0 );
191
      _mm_prefetch( ( const char* ) src1, _MM_HINT_T0 );
192
      _mm_prefetch( ( const char* ) ( src0 + src0Stride ), _MM_HINT_T0 );
193
      _mm_prefetch( ( const char* ) ( src1 + src1Stride ), _MM_HINT_T0 );
194
195
0
      for( int row = 0; row < height; row++ )
196
0
      {
197
0
        _mm_prefetch( ( const char* ) ( src0 + 2 * src0Stride ), _MM_HINT_T0 );
198
0
        _mm_prefetch( ( const char* ) ( src1 + 2 * src1Stride ), _MM_HINT_T0 );
199
200
0
        for( int col = 0; col < width; col += 16 )
201
0
        {
202
0
          __m256i vdest = _mm256_loadu_si256( (const __m256i*) & src0[col] );
203
0
          __m256i vsrc1 = _mm256_loadu_si256( (const __m256i*) & src1[col] );
204
205
0
          vdest = _mm256_adds_epi16( vdest, vsrc1 );
206
0
          vdest = _mm256_min_epi16 ( vbdmax, _mm256_max_epi16( vbdmin, vdest ) );
207
208
0
          _mm256_storeu_si256( (__m256i*) & dst[col], vdest );
209
0
        }
210
211
0
        src0 += src0Stride;
212
0
        src1 += src1Stride;
213
0
        dst += dstStride;
214
0
      }
215
0
    }
216
0
    else
217
0
#endif
218
0
    {
219
0
      __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
220
0
      __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
221
222
0
      for( int row = 0; row < height; row++ )
223
0
      {
224
0
        for( int col = 0; col < width; col += 8 )
225
0
        {
226
0
          __m128i vdest = _mm_loadu_si128( ( const __m128i * )&src0[col] );
227
0
          __m128i vsrc1 = _mm_loadu_si128( ( const __m128i * )&src1[col] );
228
229
0
          vdest = _mm_adds_epi16( vdest, vsrc1 );
230
0
          vdest = _mm_min_epi16 ( vbdmax, _mm_max_epi16( vbdmin, vdest ) );
231
232
0
          _mm_storeu_si128( ( __m128i * )&dst[col], vdest );
233
0
        }
234
235
0
        src0 += src0Stride;
236
0
        src1 += src1Stride;
237
0
        dst  += dstStride;
238
0
      }
239
0
    }
240
0
  }
241
0
  else if( W == 4 )
242
0
  {
243
0
    __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
244
0
    __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
245
246
0
    for( int row = 0; row < height; row++ )
247
0
    {
248
0
      for( int col = 0; col < width; col += 4 )
249
0
      {
250
0
        __m128i vsrc = _mm_loadu_si64( ( const __m128i * )&src0[col] );
251
0
        __m128i vdst = _mm_loadu_si64( ( const __m128i * )&src1[col] );
252
253
0
        vdst = _mm_adds_epi16( vdst, vsrc );
254
0
        vdst = _mm_min_epi16 ( vbdmax, _mm_max_epi16( vbdmin, vdst ) );
255
256
0
        _mm_storeu_si64( ( __m128i * )&dst[col], vdst );
257
0
      }
258
259
0
      src0 += src0Stride;
260
0
      src1 += src1Stride;
261
0
      dst  +=  dstStride;
262
0
    }
263
0
  }
264
0
  else
265
0
  {
266
0
    THROW_FATAL( "Unsupported size" );
267
0
  }
268
#if USE_AVX2
269
270
0
  _mm256_zeroupper();
271
0
#endif
272
0
}
Unexecuted instantiation: void vvdec::reco_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(short const*, long, short const*, long, short*, long, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::reco_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(short const*, long, short const*, long, short*, long, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::reco_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(short const*, long, short const*, long, short*, long, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::reco_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(short const*, long, short const*, long, short*, long, int, int, vvdec::ClpRngTemplate<short> const&)
273
274
template< X86_VEXT vext, int W >
275
void addWghtAvg_SSE( const int16_t* src0, ptrdiff_t src0Stride, const int16_t* src1, ptrdiff_t src1Stride, int16_t *dst, ptrdiff_t dstStride, int width, int height, int shift, int offset, int w0, int w1, const ClpRng& clpRng )
276
0
{
277
0
  if( W == 8 )
278
0
  {
279
#if USE_AVX2
280
0
    if( ( width & 15 ) == 0 && vext >= AVX2 )
281
0
    {
282
0
      __m256i voffset  = _mm256_set1_epi32( offset );
283
0
      __m256i vibdimin = _mm256_set1_epi16( clpRng.min() );
284
0
      __m256i vibdimax = _mm256_set1_epi16( clpRng.max() );
285
0
      __m256i vw       = _mm256_unpacklo_epi16( _mm256_set1_epi16( w0 ), _mm256_set1_epi16( w1 ) );
286
287
0
      for( int row = 0; row < height; row++ )
288
0
      {
289
0
        for( int col = 0; col < width; col += 16 )
290
0
        {
291
0
          __m256i vsrc0 = _mm256_loadu_si256( ( const __m256i * )&src0[col] );
292
0
          __m256i vsrc1 = _mm256_loadu_si256( ( const __m256i * )&src1[col] );
293
294
0
          __m256i vtmp, vsum;
295
0
          vsum = _mm256_madd_epi16       ( vw, _mm256_unpacklo_epi16( vsrc0, vsrc1 ) );
296
0
          vsum = _mm256_add_epi32        ( vsum, voffset );
297
0
          vtmp = _mm256_srai_epi32       ( vsum, shift );
298
299
0
          vsum = _mm256_madd_epi16       ( vw, _mm256_unpackhi_epi16( vsrc0, vsrc1 ) );
300
0
          vsum = _mm256_add_epi32        ( vsum, voffset );
301
0
          vsum = _mm256_srai_epi32       ( vsum, shift );
302
0
          vsum = _mm256_packs_epi32      ( vtmp, vsum );
303
304
0
          vsum = _mm256_min_epi16( vibdimax, _mm256_max_epi16( vibdimin, vsum ) );
305
0
          _mm256_storeu_si256( ( __m256i * )&dst[col], vsum );
306
0
        }
307
308
0
        src0 += src0Stride;
309
0
        src1 += src1Stride;
310
0
        dst  +=  dstStride;
311
0
      }
312
0
    }
313
0
    else
314
0
#endif
315
0
    {
316
0
      __m128i voffset  = _mm_set1_epi32( offset );
317
0
      __m128i vibdimin = _mm_set1_epi16( clpRng.min() );
318
0
      __m128i vibdimax = _mm_set1_epi16( clpRng.max() );
319
0
      __m128i vw       = _mm_unpacklo_epi16( _mm_set1_epi16( w0 ), _mm_set1_epi16( w1 ) );
320
321
0
      for( int row = 0; row < height; row++ )
322
0
      {
323
0
        for( int col = 0; col < width; col += 8 )
324
0
        {
325
0
          __m128i vsrc0 = _mm_loadu_si128( ( const __m128i * )&src0[col] );
326
0
          __m128i vsrc1 = _mm_loadu_si128( ( const __m128i * )&src1[col] );
327
328
0
          __m128i vtmp, vsum;
329
0
          vsum = _mm_madd_epi16       ( vw, _mm_unpacklo_epi16( vsrc0, vsrc1 ) );
330
0
          vsum = _mm_add_epi32        ( vsum, voffset );
331
0
          vtmp = _mm_srai_epi32       ( vsum, shift );
332
333
0
          vsum = _mm_madd_epi16       ( vw, _mm_unpackhi_epi16( vsrc0, vsrc1 ) );
334
0
          vsum = _mm_add_epi32        ( vsum, voffset );
335
0
          vsum = _mm_srai_epi32       ( vsum, shift );
336
0
          vsum = _mm_packs_epi32      ( vtmp, vsum );
337
338
0
          vsum = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vsum ) );
339
0
          _mm_storeu_si128( ( __m128i * )&dst[col], vsum );
340
0
        }
341
342
0
        src0 += src0Stride;
343
0
        src1 += src1Stride;
344
0
        dst  +=  dstStride;
345
0
      }
346
0
    }
347
0
  }
348
0
  else if( W == 4 )
349
0
  {
350
0
    __m128i vzero     = _mm_setzero_si128();
351
0
    __m128i voffset   = _mm_set1_epi32( offset );
352
0
    __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
353
0
    __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
354
0
    __m128i vw        = _mm_unpacklo_epi16( _mm_set1_epi16( w0 ), _mm_set1_epi16( w1 ) );
355
356
0
    for( int row = 0; row < height; row++ )
357
0
    {
358
0
      for( int col = 0; col < width; col += 4 )
359
0
      {
360
0
        __m128i vsum = _mm_loadu_si64  ( ( const __m128i * )&src0[col] );
361
0
        __m128i vdst = _mm_loadu_si64  ( ( const __m128i * )&src1[col] );
362
0
        vsum = _mm_madd_epi16          ( vw, _mm_unpacklo_epi16( vsum, vdst ) );
363
0
        vsum = _mm_add_epi32           ( vsum, voffset );
364
0
        vsum = _mm_srai_epi32          ( vsum, shift );
365
0
        vsum = _mm_packs_epi32         ( vsum, vzero );
366
367
0
        vsum = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vsum ) );
368
0
        _mm_storeu_si64( ( __m128i * )&dst[col], vsum );
369
0
      }
370
371
0
      src0 += src0Stride;
372
0
      src1 += src1Stride;
373
0
      dst  +=  dstStride;
374
0
    }
375
0
  }
376
0
  else
377
0
  {
378
0
    THROW_FATAL( "Unsupported size" );
379
0
  }
380
#if USE_AVX2
381
382
0
  _mm256_zeroupper();
383
0
#endif
384
0
}
Unexecuted instantiation: void vvdec::addWghtAvg_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(short const*, long, short const*, long, short*, long, int, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::addWghtAvg_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(short const*, long, short const*, long, short*, long, int, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::addWghtAvg_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(short const*, long, short const*, long, short*, long, int, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::addWghtAvg_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(short const*, long, short const*, long, short*, long, int, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
385
386
template<bool doShift, bool shiftR, typename T> static inline void do_shift( T &vreg, int num );
387
#if USE_AVX2
388
0
template<> inline void do_shift<true,  true , __m256i>( __m256i &vreg, int num ) { vreg = _mm256_sra_epi32( vreg, _mm_cvtsi32_si128( num ) ); }
389
0
template<> inline void do_shift<true,  false, __m256i>( __m256i &vreg, int num ) { vreg = _mm256_sll_epi32( vreg, _mm_cvtsi32_si128( num ) ); }
390
0
template<> inline void do_shift<false, true , __m256i>( __m256i &vreg, int num ) { }
391
0
template<> inline void do_shift<false, false, __m256i>( __m256i &vreg, int num ) { }
392
#endif
393
0
template<> inline void do_shift<true,  true , __m128i>( __m128i &vreg, int num ) { vreg = _mm_sra_epi32( vreg, _mm_cvtsi32_si128( num ) ); }
Unexecuted instantiation: Buffer_sse41.cpp:void vvdec::do_shift<true, true, long long __vector(2)>(long long __vector(2)&, int)
Unexecuted instantiation: Buffer_avx2.cpp:void vvdec::do_shift<true, true, long long __vector(2)>(long long __vector(2)&, int)
394
0
template<> inline void do_shift<true,  false, __m128i>( __m128i &vreg, int num ) { vreg = _mm_sll_epi32( vreg, _mm_cvtsi32_si128( num ) ); }
Unexecuted instantiation: Buffer_sse41.cpp:void vvdec::do_shift<true, false, long long __vector(2)>(long long __vector(2)&, int)
Unexecuted instantiation: Buffer_avx2.cpp:void vvdec::do_shift<true, false, long long __vector(2)>(long long __vector(2)&, int)
395
0
template<> inline void do_shift<false, true , __m128i>( __m128i &vreg, int num ) { }
Unexecuted instantiation: Buffer_sse41.cpp:void vvdec::do_shift<false, true, long long __vector(2)>(long long __vector(2)&, int)
Unexecuted instantiation: Buffer_avx2.cpp:void vvdec::do_shift<false, true, long long __vector(2)>(long long __vector(2)&, int)
396
0
template<> inline void do_shift<false, false, __m128i>( __m128i &vreg, int num ) { }
Unexecuted instantiation: Buffer_sse41.cpp:void vvdec::do_shift<false, false, long long __vector(2)>(long long __vector(2)&, int)
Unexecuted instantiation: Buffer_avx2.cpp:void vvdec::do_shift<false, false, long long __vector(2)>(long long __vector(2)&, int)
397
398
template<bool mult, typename T> static inline void do_mult( T& vreg, T& vmult );
399
0
template<> inline void do_mult<false, __m128i>( __m128i&, __m128i& ) { }
Unexecuted instantiation: Buffer_sse41.cpp:void vvdec::do_mult<false, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&)
Unexecuted instantiation: Buffer_avx2.cpp:void vvdec::do_mult<false, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&)
400
#if USE_AVX2
401
0
template<> inline void do_mult<false, __m256i>( __m256i&, __m256i& ) { }
402
#endif
403
0
template<> inline void do_mult<true,   __m128i>( __m128i& vreg, __m128i& vmult ) { vreg = _mm_mullo_epi32   ( vreg, vmult ); }
Unexecuted instantiation: Buffer_sse41.cpp:void vvdec::do_mult<true, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&)
Unexecuted instantiation: Buffer_avx2.cpp:void vvdec::do_mult<true, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&)
404
#if USE_AVX2
405
0
template<> inline void do_mult<true,   __m256i>( __m256i& vreg, __m256i& vmult ) { vreg = _mm256_mullo_epi32( vreg, vmult ); }
406
#endif
407
408
template<bool add, typename T> static inline void do_add( T& vreg, T& vadd );
409
0
template<> inline void do_add<false, __m128i>( __m128i&, __m128i& ) { }
Unexecuted instantiation: Buffer_sse41.cpp:void vvdec::do_add<false, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&)
Unexecuted instantiation: Buffer_avx2.cpp:void vvdec::do_add<false, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&)
410
#if USE_AVX2
411
0
template<> inline void do_add<false, __m256i>( __m256i&, __m256i& ) { }
412
#endif
413
0
template<> inline void do_add<true,  __m128i>( __m128i& vreg, __m128i& vadd ) { vreg = _mm_add_epi32( vreg, vadd ); }
Unexecuted instantiation: Buffer_sse41.cpp:void vvdec::do_add<true, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&)
Unexecuted instantiation: Buffer_avx2.cpp:void vvdec::do_add<true, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&)
414
#if USE_AVX2
415
0
template<> inline void do_add<true,  __m256i>( __m256i& vreg, __m256i& vadd ) { vreg = _mm256_add_epi32( vreg, vadd ); }
416
#endif
417
418
template<bool clip, typename T> static inline void do_clip( T& vreg, T& vbdmin, T& vbdmax );
419
0
template<> inline void do_clip<false, __m128i>( __m128i&, __m128i&, __m128i& ) { }
Unexecuted instantiation: Buffer_sse41.cpp:void vvdec::do_clip<false, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&, long long __vector(2)&)
Unexecuted instantiation: Buffer_avx2.cpp:void vvdec::do_clip<false, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&, long long __vector(2)&)
420
0
template<> inline void do_clip<true,  __m128i>( __m128i& vreg, __m128i& vbdmin, __m128i& vbdmax ) { vreg = _mm_min_epi16   ( vbdmax, _mm_max_epi16   ( vbdmin, vreg ) ); }
Unexecuted instantiation: Buffer_sse41.cpp:void vvdec::do_clip<true, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&, long long __vector(2)&)
Unexecuted instantiation: Buffer_avx2.cpp:void vvdec::do_clip<true, long long __vector(2)>(long long __vector(2)&, long long __vector(2)&, long long __vector(2)&)
421
422
423
template<X86_VEXT vext, int W, bool doAdd, bool doMult, bool doShift, bool shiftR, bool clip>
424
void linTf_SSE( const int16_t* src, ptrdiff_t srcStride, int16_t* dst, ptrdiff_t dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng )
425
0
{
426
0
  if( vext >= AVX2 && ( width & 7 ) == 0 && W == 8 )
427
0
  {
428
#if USE_AVX2
429
    __m128i xbdmin   = _mm_set1_epi16( clpRng.min() );
430
    __m128i xbdmax   = _mm_set1_epi16( clpRng.max() );
431
    __m256i voffset  = _mm256_set1_epi32( offset );
432
    __m256i vscale   = _mm256_set1_epi32( scale );
433
434
0
    for( int row = 0; row < height; row++ )
435
0
    {
436
0
      for( int col = 0; col < width; col += 8 )
437
0
      {
438
0
        __m256i val;
439
0
        val = _mm256_cvtepi16_epi32       (  _mm_lddqu_si128( ( const __m128i * )&src[col] ) );
440
0
        do_mult<doMult, __m256i>          ( val, vscale );
441
0
        do_shift<doShift, shiftR, __m256i>( val, shift );
442
0
        do_add<doAdd, __m256i>            ( val, voffset );
443
0
        __m128i
444
0
        xal = _mm256_cvtepi32_epi16x      ( val );
445
0
        do_clip<clip, __m128i>            ( xal, xbdmin, xbdmax );
446
447
0
        _mm_storeu_si128                  ( ( __m128i * )&dst[col], xal );
448
0
      }
449
450
0
      src += srcStride;
451
0
      dst += dstStride;
452
0
    }
453
#endif
454
0
  }
455
0
  else
456
0
  {
457
0
    __m128i vzero   = _mm_setzero_si128();
458
0
    __m128i vbdmin  = _mm_set1_epi16   ( clpRng.min() );
459
0
    __m128i vbdmax  = _mm_set1_epi16   ( clpRng.max() );
460
0
    __m128i voffset = _mm_set1_epi32   ( offset );
461
0
    __m128i vscale  = _mm_set1_epi32   ( scale );
462
463
0
    for( int row = 0; row < height; row++ )
464
0
    {
465
0
      for( int col = 0; col < width; col += 4 )
466
0
      {
467
0
        __m128i val;
468
0
        val = _mm_loadu_si64             ( ( const __m128i * )&src[col] );
469
0
        val = _mm_cvtepi16_epi32          ( val );
470
0
        do_mult<doMult, __m128i>          ( val, vscale );
471
0
        do_shift<doShift, shiftR, __m128i>( val, shift );
472
0
        do_add<doAdd, __m128i>            ( val, voffset );
473
0
        val = _mm_packs_epi32             ( val, vzero );
474
0
        do_clip<clip, __m128i>            ( val, vbdmin, vbdmax );
475
476
0
        _mm_storeu_si64                  ( ( __m128i * )&dst[col], val );
477
0
      }
478
479
0
      src += srcStride;
480
0
      dst += dstStride;
481
0
    }
482
0
  }
483
#if USE_AVX2
484
485
  _mm256_zeroupper();
486
#endif
487
0
}
Unexecuted instantiation: void vvdec::linTf_SSE<(vvdec::x86_simd::X86_VEXT)1, 8, true, true, true, true, true>(short const*, long, short*, long, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::linTf_SSE<(vvdec::x86_simd::X86_VEXT)1, 4, true, true, true, true, true>(short const*, long, short*, long, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::linTf_SSE<(vvdec::x86_simd::X86_VEXT)4, 8, true, true, true, true, true>(short const*, long, short*, long, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::linTf_SSE<(vvdec::x86_simd::X86_VEXT)4, 4, true, true, true, true, true>(short const*, long, short*, long, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&)
488
489
template<X86_VEXT vext, int W>
490
void linTf_SSE_entry( const int16_t* src, ptrdiff_t srcStride, int16_t* dst, ptrdiff_t dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool clip )
491
0
{
492
0
  linTf_SSE<vext, W, true,  true,  true,  true,  true >( src, srcStride, dst, dstStride, width, height, scale,  shift, offset, clpRng );
493
0
}
Unexecuted instantiation: void vvdec::linTf_SSE_entry<(vvdec::x86_simd::X86_VEXT)1, 8>(short const*, long, short*, long, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&, bool)
Unexecuted instantiation: void vvdec::linTf_SSE_entry<(vvdec::x86_simd::X86_VEXT)1, 4>(short const*, long, short*, long, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&, bool)
Unexecuted instantiation: void vvdec::linTf_SSE_entry<(vvdec::x86_simd::X86_VEXT)4, 8>(short const*, long, short*, long, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&, bool)
Unexecuted instantiation: void vvdec::linTf_SSE_entry<(vvdec::x86_simd::X86_VEXT)4, 4>(short const*, long, short*, long, int, int, int, int, int, vvdec::ClpRngTemplate<short> const&, bool)
494
495
template<X86_VEXT vext, int W>
496
void transposePel_SSE( const Pel* src, ptrdiff_t srcStride, Pel* dst, ptrdiff_t dstStride )
497
0
{
498
0
  if( W == 4 )
499
0
  {
500
0
    __m128i va, vb, vc, vd;
501
502
0
    va = _mm_loadu_si64( ( const __m128i* ) src ); src += srcStride;
503
0
    vb = _mm_loadu_si64( ( const __m128i* ) src ); src += srcStride;
504
0
    vc = _mm_loadu_si64( ( const __m128i* ) src ); src += srcStride;
505
0
    vd = _mm_loadu_si64( ( const __m128i* ) src );
506
507
0
    __m128i va01b01 = _mm_unpacklo_epi16( va,      vb );
508
0
    __m128i va23b23 = _mm_unpackhi_epi64( va01b01, vb );
509
0
    __m128i vc01d01 = _mm_unpacklo_epi16( vc,      vd );
510
0
    __m128i vc23d23 = _mm_unpackhi_epi64( vc01d01, vd );
511
512
0
    va = _mm_unpacklo_epi32( va01b01, vc01d01 );
513
0
    vb = _mm_unpackhi_epi64( va,      va );
514
0
    vc = _mm_unpacklo_epi32( va23b23, vc23d23 );
515
0
    vd = _mm_unpackhi_epi64( vc,      vc );
516
517
0
    _mm_storeu_si64( ( __m128i* ) dst, va ); dst += dstStride;
518
0
    _mm_storeu_si64( ( __m128i* ) dst, vb ); dst += dstStride;
519
0
    _mm_storeu_si64( ( __m128i* ) dst, vc ); dst += dstStride;
520
0
    _mm_storeu_si64( ( __m128i* ) dst, vd );
521
0
  }
522
0
  else if( W == 8 )
523
0
  {
524
0
    __m128i va, vb, vc, vd, ve, vf, vg, vh;
525
526
0
    va = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
527
0
    vb = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
528
0
    vc = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
529
0
    vd = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
530
0
    ve = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
531
0
    vf = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
532
0
    vg = _mm_loadu_si128( ( const __m128i* ) src ); src += srcStride;
533
0
    vh = _mm_loadu_si128( ( const __m128i* ) src );
534
535
0
    __m128i va01b01 = _mm_unpacklo_epi16( va, vb );
536
0
    __m128i va23b23 = _mm_unpackhi_epi16( va, vb );
537
0
    __m128i vc01d01 = _mm_unpacklo_epi16( vc, vd );
538
0
    __m128i vc23d23 = _mm_unpackhi_epi16( vc, vd );
539
0
    __m128i ve01f01 = _mm_unpacklo_epi16( ve, vf );
540
0
    __m128i ve23f23 = _mm_unpackhi_epi16( ve, vf );
541
0
    __m128i vg01h01 = _mm_unpacklo_epi16( vg, vh );
542
0
    __m128i vg23h23 = _mm_unpackhi_epi16( vg, vh );
543
544
0
    va = _mm_unpacklo_epi32( va01b01, vc01d01 );
545
0
    vb = _mm_unpackhi_epi32( va01b01, vc01d01 );
546
0
    vc = _mm_unpacklo_epi32( va23b23, vc23d23 );
547
0
    vd = _mm_unpackhi_epi32( va23b23, vc23d23 );
548
0
    ve = _mm_unpacklo_epi32( ve01f01, vg01h01 );
549
0
    vf = _mm_unpackhi_epi32( ve01f01, vg01h01 );
550
0
    vg = _mm_unpacklo_epi32( ve23f23, vg23h23 );
551
0
    vh = _mm_unpackhi_epi32( ve23f23, vg23h23 );
552
553
0
    va01b01 = _mm_unpacklo_epi64( va, ve );
554
0
    va23b23 = _mm_unpackhi_epi64( va, ve );
555
0
    vc01d01 = _mm_unpacklo_epi64( vb, vf );
556
0
    vc23d23 = _mm_unpackhi_epi64( vb, vf );
557
0
    ve01f01 = _mm_unpacklo_epi64( vc, vg );
558
0
    ve23f23 = _mm_unpackhi_epi64( vc, vg );
559
0
    vg01h01 = _mm_unpacklo_epi64( vd, vh );
560
0
    vg23h23 = _mm_unpackhi_epi64( vd, vh );
561
562
0
    _mm_storeu_si128( ( __m128i* ) dst, va01b01 ); dst += dstStride;
563
0
    _mm_storeu_si128( ( __m128i* ) dst, va23b23 ); dst += dstStride;
564
0
    _mm_storeu_si128( ( __m128i* ) dst, vc01d01 ); dst += dstStride;
565
0
    _mm_storeu_si128( ( __m128i* ) dst, vc23d23 ); dst += dstStride;
566
0
    _mm_storeu_si128( ( __m128i* ) dst, ve01f01 ); dst += dstStride;
567
0
    _mm_storeu_si128( ( __m128i* ) dst, ve23f23 ); dst += dstStride;
568
0
    _mm_storeu_si128( ( __m128i* ) dst, vg01h01 ); dst += dstStride;
569
0
    _mm_storeu_si128( ( __m128i* ) dst, vg23h23 );
570
0
  }
571
#if USE_AVX2
572
573
  _mm256_zeroupper();
574
#endif
575
0
}
Unexecuted instantiation: void vvdec::transposePel_SSE<(vvdec::x86_simd::X86_VEXT)1, 4>(short const*, long, short*, long)
Unexecuted instantiation: void vvdec::transposePel_SSE<(vvdec::x86_simd::X86_VEXT)1, 8>(short const*, long, short*, long)
Unexecuted instantiation: void vvdec::transposePel_SSE<(vvdec::x86_simd::X86_VEXT)4, 4>(short const*, long, short*, long)
Unexecuted instantiation: void vvdec::transposePel_SSE<(vvdec::x86_simd::X86_VEXT)4, 8>(short const*, long, short*, long)
576
577
template<X86_VEXT vext>
void copyBuffer_SSE( const char *src, ptrdiff_t srcStride, char *dst, ptrdiff_t dstStride, int width, int height )
{
  // Copies a width x height byte block from src to dst, honoring the two
  // (possibly different) row strides. Strides and width are in bytes.

  // Warm the cache with the first two source rows before any copying starts.
  _mm_prefetch( ( const char * ) src,                 _MM_HINT_T0 );
  _mm_prefetch( ( const char * ) ( src + srcStride ), _MM_HINT_T0 );

  // When both surfaces are densely packed the whole block is one
  // contiguous range, so a single bulk copy is enough.
  if( srcStride == width && dstStride == width )
  {
    memcpy( dst, src, width * height );
    return;
  }

  // Strided layout: copy row by row, prefetching the upcoming source row
  // while the current one is being transferred.
  for( ; height > 0; --height, src += srcStride, dst += dstStride )
  {
    _mm_prefetch( src + srcStride, _MM_HINT_T0 );

    memcpy( dst, src, width );
  }
}
Unexecuted instantiation: void vvdec::copyBuffer_SSE<(vvdec::x86_simd::X86_VEXT)1>(char const*, long, char*, long, int, int)
Unexecuted instantiation: void vvdec::copyBuffer_SSE<(vvdec::x86_simd::X86_VEXT)4>(char const*, long, char*, long, int, int)
602
603
template<X86_VEXT vext>
604
void applyLut_SIMD( Pel* ptr, ptrdiff_t ptrStride, int width, int height, const Pel* lut )
605
0
{
606
0
  _mm_prefetch( ( const char* ) &ptr[0 * ptrStride], _MM_HINT_T0 );
607
0
  _mm_prefetch( ( const char* ) &ptr[1 * ptrStride], _MM_HINT_T0 );
608
0
  _mm_prefetch( ( const char* ) &ptr[0 * ptrStride + (width >> 1)], _MM_HINT_T0 );
609
0
  _mm_prefetch( ( const char* ) &ptr[1 * ptrStride + (width >> 1)], _MM_HINT_T0 );
610
611
0
#if USE_AVX2
612
0
  if( ( width & 15 ) == 0 && ( height & 1 ) == 0 )
613
0
  {
614
0
    const __m256i vLutShuf = _mm256_setr_epi8( 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 );
615
616
0
    for( int y = 0; y < height; y += 2 )
617
0
    {
618
0
      _mm_prefetch( ( const char* ) &ptr[2 * ptrStride], _MM_HINT_T0 );
619
0
      _mm_prefetch( ( const char* ) &ptr[3 * ptrStride], _MM_HINT_T0 );
620
0
      _mm_prefetch( ( const char* ) &ptr[2 * ptrStride + ( width >> 1 )], _MM_HINT_T0 );
621
0
      _mm_prefetch( ( const char* ) &ptr[3 * ptrStride + ( width >> 1 )], _MM_HINT_T0 );
622
623
0
      for( int x = 0; x < width; x += 16 )
624
0
      {
625
0
        __m256i vin16    = _mm256_load_si256        ( ( const __m256i * ) &ptr[x] );
626
0
        __m256i vin16x   = _mm256_load_si256        ( ( const __m256i * ) &ptr[x + ptrStride] );
627
628
0
        __m256i vin32_1  = _mm256_unpacklo_epi16    ( vin16,  _mm256_setzero_si256() );
629
0
        __m256i vin32_2  = _mm256_unpackhi_epi16    ( vin16,  _mm256_setzero_si256() );
630
0
        __m256i vin32_1x = _mm256_unpacklo_epi16    ( vin16x, _mm256_setzero_si256() );
631
0
        __m256i vin32_2x = _mm256_unpackhi_epi16    ( vin16x, _mm256_setzero_si256() );
632
633
0
        __m256i vout32_1 = _mm256_i32gather_epi32   ( ( const int * ) lut, vin32_1,  2 );
634
0
        __m256i vout32_2 = _mm256_i32gather_epi32   ( ( const int * ) lut, vin32_2,  2 );
635
0
        __m256i vout32_1x= _mm256_i32gather_epi32   ( ( const int * ) lut, vin32_1x, 2 );
636
0
        __m256i vout32_2x= _mm256_i32gather_epi32   ( ( const int * ) lut, vin32_2x, 2 );
637
638
0
        vout32_1         = _mm256_shuffle_epi8      ( vout32_1,  vLutShuf );
639
0
        vout32_2         = _mm256_shuffle_epi8      ( vout32_2,  vLutShuf );
640
0
        vout32_1x        = _mm256_shuffle_epi8      ( vout32_1x, vLutShuf );
641
0
        vout32_2x        = _mm256_shuffle_epi8      ( vout32_2x, vLutShuf );
642
643
0
        __m256i vout16   = _mm256_unpacklo_epi64    ( vout32_1,  vout32_2 );
644
0
        __m256i vout16x  = _mm256_unpacklo_epi64    ( vout32_1x, vout32_2x );
645
646
0
        _mm256_store_si256( ( __m256i * ) &ptr[x],             vout16 );
647
0
        _mm256_store_si256( ( __m256i * ) &ptr[x + ptrStride], vout16x );
648
0
      }
649
650
0
      ptr += ( ptrStride << 1 );
651
0
    }
652
653
0
    _mm256_zeroupper();
654
0
  }
655
0
  else
656
0
#endif
657
0
  {
658
0
#define RSP_SGNL_OP( ADDR ) ptr[ADDR] = lut[ptr[ADDR]]
659
0
#define RSP_SGNL_INC        ptr      += ptrStride;
660
661
0
    SIZE_AWARE_PER_EL_OP( RSP_SGNL_OP, RSP_SGNL_INC )
662
663
0
#undef RSP_SGNL_OP
664
0
#undef RSP_SGNL_INC
665
0
  }
666
0
}
Unexecuted instantiation: void vvdec::applyLut_SIMD<(vvdec::x86_simd::X86_VEXT)1>(short*, long, int, int, short const*)
Unexecuted instantiation: void vvdec::applyLut_SIMD<(vvdec::x86_simd::X86_VEXT)4>(short*, long, int, int, short const*)
667
668
template<X86_VEXT vext>
void rspBcwCore_SIMD( Pel* ptr, ptrdiff_t ptrStride, int width, int height, const int bd, const int minBin, const int maxBin, const Pel* LmcsPivot, const Pel* InvScCoeff, const Pel* InputPivot )
{
  // In-place inverse mapping of a width x height block of 16-bit samples:
  // for each sample, locate the bin i (minBin..effMaxBin) of LmcsPivot it
  // falls into, then map it to
  //   clip( InputPivot[i] + mulhrs( (sample - LmcsPivot[i]) << 4, InvScCoeff[i] ), 0, (1<<bd)-1 ).
  // NOTE(review): this appears to be the LMCS inverse luma reshaping of
  // VVC (the pivot/coefficient names suggest so) -- confirm against the spec.
  // Requires width to be a multiple of 8 (or 16 for the AVX2 path) and,
  // implicitly, height to be a multiple of 4 (rows are processed 4 at a time).
  const int effMaxBin = maxBin < PIC_CODE_CW_BINS - 1 ? maxBin + 1 : maxBin;

  // Warm up the first four rows.
  _mm_prefetch( ( const char* ) ( ptr + 0 * ptrStride ), _MM_HINT_T0 );
  _mm_prefetch( ( const char* ) ( ptr + 1 * ptrStride ), _MM_HINT_T0 );
  _mm_prefetch( ( const char* ) ( ptr + 2 * ptrStride ), _MM_HINT_T0 );
  _mm_prefetch( ( const char* ) ( ptr + 3 * ptrStride ), _MM_HINT_T0 );

#if USE_AVX2
  if( ( width & 15 ) == 0 && vext >= AVX2 )
  {
    // Repack the 16 InputPivot entries into separate low-byte/high-byte
    // tables, duplicated into both 128-bit lanes, so that a byte index in
    // 0..15 can fetch a 16-bit entry via two _mm256_shuffle_epi8 lookups.
    __m128i xtmp1, xtmp2, xtmp3, xtmp4;
    xtmp1 = _mm_loadu_si128( ( const __m128i* ) &InputPivot[0] );
    xtmp2 = _mm_loadu_si128( ( const __m128i* ) &InputPivot[8] );
    xtmp3 = _mm_shuffle_epi8( xtmp1, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
    xtmp4 = _mm_shuffle_epi8( xtmp2, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
    xtmp1 = _mm_unpacklo_epi64( xtmp3, xtmp4 );
    xtmp2 = _mm_unpackhi_epi64( xtmp3, xtmp4 );

    const __m256i mInputPivotLo = _mm256_inserti128_si256( _mm256_castsi128_si256( xtmp1 ), xtmp1, 1 );
    const __m256i mInputPivotHi = _mm256_inserti128_si256( _mm256_castsi128_si256( xtmp2 ), xtmp2, 1 );

    // Same byte-table repacking for the inverse scaling coefficients.
    xtmp1 = _mm_loadu_si128( ( const __m128i* ) &InvScCoeff[0] );
    xtmp2 = _mm_loadu_si128( ( const __m128i* ) &InvScCoeff[8] );
    xtmp3 = _mm_shuffle_epi8( xtmp1, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
    xtmp4 = _mm_shuffle_epi8( xtmp2, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
    xtmp1 = _mm_unpacklo_epi64( xtmp3, xtmp4 );
    xtmp2 = _mm_unpackhi_epi64( xtmp3, xtmp4 );

    const __m256i mScaleCoeffLo = _mm256_inserti128_si256( _mm256_castsi128_si256( xtmp1 ), xtmp1, 1 );
    const __m256i mScaleCoeffHi = _mm256_inserti128_si256( _mm256_castsi128_si256( xtmp2 ), xtmp2, 1 );

    // Output clipping range [0, (1 << bd) - 1].
    const __m256i mMin = _mm256_setzero_si256();
    const __m256i mMax = _mm256_set1_epi16( ( 1 << bd ) - 1 );

    // Process 4 rows x 16 samples per inner iteration.
    for( int y = 0; y < height; y += 4 )
    {
      _mm_prefetch( ( const char* ) ( ptr + 4 * ptrStride ), _MM_HINT_T0 );
      _mm_prefetch( ( const char* ) ( ptr + 5 * ptrStride ), _MM_HINT_T0 );
      _mm_prefetch( ( const char* ) ( ptr + 6 * ptrStride ), _MM_HINT_T0 );
      _mm_prefetch( ( const char* ) ( ptr + 7 * ptrStride ), _MM_HINT_T0 );

      for( int x = 0; x < width; x += 16 )
      {
        const __m256i xsrc0 = _mm256_load_si256( ( const __m256i* ) ( ptr + x + 0 * ptrStride ) );
        const __m256i xsrc1 = _mm256_load_si256( ( const __m256i* ) ( ptr + x + 1 * ptrStride ) );
        const __m256i xsrc2 = _mm256_load_si256( ( const __m256i* ) ( ptr + x + 2 * ptrStride ) );
        const __m256i xsrc3 = _mm256_load_si256( ( const __m256i* ) ( ptr + x + 3 * ptrStride ) );

        // Horizontal minimum of all 64 samples: gives a starting bin that is
        // a lower bound for every sample in the 4x16 tile.
        __m256i diff0 = _mm256_min_epi16( xsrc0, xsrc1 );
        __m256i diff2 = _mm256_min_epi16( xsrc2, xsrc3 );
        __m256i diff1, diff3;

        diff0   = _mm256_min_epi16( diff0, diff2 );
        __m128i
        diffx   = _mm_minpos_epu16( _mm_min_epi16( _mm256_castsi256_si128  ( diff0 ),
                                                   _mm256_extracti128_si256( diff0, 1 ) ) );
        int min = _mm_extract_epi16( diffx, 0 );

        // Unrolled linear scan (Duff's-device style fallthrough): advances i
        // past every pivot that is <= min; exits at the first pivot > min.
        int i = minBin;
        switch( effMaxBin - minBin )
        {
        default:
        case 15: if( min < LmcsPivot[++i] ) { break; };
        case 14: if( min < LmcsPivot[++i] ) { break; };
        case 13: if( min < LmcsPivot[++i] ) { break; };
        case 12: if( min < LmcsPivot[++i] ) { break; };
        case 11: if( min < LmcsPivot[++i] ) { break; };
        case 10: if( min < LmcsPivot[++i] ) { break; };
        case  9: if( min < LmcsPivot[++i] ) { break; };
        case  8: if( min < LmcsPivot[++i] ) { break; };
        case  7: if( min < LmcsPivot[++i] ) { break; };
        case  6: if( min < LmcsPivot[++i] ) { break; };
        case  5: if( min < LmcsPivot[++i] ) { break; };
        case  4: if( min < LmcsPivot[++i] ) { break; };
        case  3: if( min < LmcsPivot[++i] ) { break; };
        case  2: if( min < LmcsPivot[++i] ) { break; };
        case  1: if( min < LmcsPivot[++i] ) { break; };
        case  0: if( min < LmcsPivot[++i] ) { break; };
        }

        // Step back to the last pivot that was <= min.
        --i;

        // Per-sample bin indices, all starting at the common lower bound.
        __m256i xidx0 = _mm256_set1_epi16( i );
        __m256i xidx1 = xidx0;
        __m256i xidx2 = xidx0;
        __m256i xidx3 = xidx0;

        __m256i xlmcs = _mm256_set1_epi16( LmcsPivot[i] );

        // diffN = sample - LmcsPivot[i]; refined below together with the index.
        diff0 = _mm256_sub_epi16( xsrc0, xlmcs );
        diff1 = _mm256_sub_epi16( xsrc1, xlmcs );
        diff2 = _mm256_sub_epi16( xsrc2, xlmcs );
        diff3 = _mm256_sub_epi16( xsrc3, xlmcs );

        // Refinement: for each higher bin, samples at or above that pivot get
        // a smaller *unsigned* difference; the cmpeq mask (-1) then bumps
        // their index via the subtract. Loop exits when no lane advanced.
        for( ++i; i <= effMaxBin; ++i )
        {
          __m256i
          xlmcs         = _mm256_set1_epi16( LmcsPivot[i] );   // shadows outer xlmcs intentionally

          __m256i currd = _mm256_sub_epi16( xsrc0, xlmcs );
          diff0         = _mm256_min_epu16( diff0, currd );
          __m256i chnd0 = _mm256_cmpeq_epi16( currd, diff0 );

          currd         = _mm256_sub_epi16( xsrc1, xlmcs );
          diff1         = _mm256_min_epu16( diff1, currd );
          __m256i chnd1 = _mm256_cmpeq_epi16( currd, diff1 );

          currd         = _mm256_sub_epi16( xsrc2, xlmcs );
          diff2         = _mm256_min_epu16( diff2, currd );
          __m256i chnd2 = _mm256_cmpeq_epi16( currd, diff2 );

          currd         = _mm256_sub_epi16( xsrc3, xlmcs );
          diff3         = _mm256_min_epu16( diff3, currd );
          __m256i chnd3 = _mm256_cmpeq_epi16( currd, diff3 );

          // Subtracting the -1 mask increments the bin index per lane.
          xidx0         = _mm256_sub_epi16( xidx0, chnd0 );
          xidx1         = _mm256_sub_epi16( xidx1, chnd1 );
          xidx2         = _mm256_sub_epi16( xidx2, chnd2 );
          xidx3         = _mm256_sub_epi16( xidx3, chnd3 );

          chnd0         = _mm256_or_si256( chnd0, chnd1 );
          chnd2         = _mm256_or_si256( chnd2, chnd3 );
          chnd0         = _mm256_or_si256( chnd0, chnd2 );

          if( _mm256_movemask_epi8( chnd0 ) == 0 ) break;
        }

        // Pack 16-bit indices to bytes (pad lanes with -1) for shuffle lookup.
        xidx0 = _mm256_packs_epi16( xidx0, _mm256_set1_epi8( -1 ) );
        xidx1 = _mm256_packs_epi16( xidx1, _mm256_set1_epi8( -1 ) );
        xidx2 = _mm256_packs_epi16( xidx2, _mm256_set1_epi8( -1 ) );
        xidx3 = _mm256_packs_epi16( xidx3, _mm256_set1_epi8( -1 ) );

        // Row 0: table-lookup pivot/coefficient, scale, offset, clip, store.
        __m256i xinp = _mm256_unpacklo_epi8( _mm256_shuffle_epi8( mInputPivotLo, xidx0 ), _mm256_shuffle_epi8( mInputPivotHi, xidx0 ) );
        __m256i xscl = _mm256_unpacklo_epi8( _mm256_shuffle_epi8( mScaleCoeffLo, xidx0 ), _mm256_shuffle_epi8( mScaleCoeffHi, xidx0 ) );

        __m256i
        xtmp1 = _mm256_slli_epi16( diff0, 4 );
        xtmp1 = _mm256_mulhrs_epi16( xtmp1, xscl );

        xtmp1 = _mm256_add_epi16( xinp, xtmp1 );

        xtmp1 = _mm256_min_epi16( xtmp1, mMax );
        xtmp1 = _mm256_max_epi16( xtmp1, mMin );

        _mm256_store_si256( ( __m256i * ) &ptr[x], xtmp1 );

        // Row 1.
        xinp = _mm256_unpacklo_epi8( _mm256_shuffle_epi8( mInputPivotLo, xidx1 ), _mm256_shuffle_epi8( mInputPivotHi, xidx1 ) );
        xscl = _mm256_unpacklo_epi8( _mm256_shuffle_epi8( mScaleCoeffLo, xidx1 ), _mm256_shuffle_epi8( mScaleCoeffHi, xidx1 ) );

        xtmp1 = _mm256_slli_epi16( diff1, 4 );
        xtmp1 = _mm256_mulhrs_epi16( xtmp1, xscl );

        xtmp1 = _mm256_add_epi16( xinp, xtmp1 );

        xtmp1 = _mm256_min_epi16( xtmp1, mMax );
        xtmp1 = _mm256_max_epi16( xtmp1, mMin );

        _mm256_store_si256( (__m256i*) & ptr[x+ptrStride], xtmp1 );

        // Row 2.
        xinp = _mm256_unpacklo_epi8( _mm256_shuffle_epi8( mInputPivotLo, xidx2 ), _mm256_shuffle_epi8( mInputPivotHi, xidx2 ) );
        xscl = _mm256_unpacklo_epi8( _mm256_shuffle_epi8( mScaleCoeffLo, xidx2 ), _mm256_shuffle_epi8( mScaleCoeffHi, xidx2 ) );

        xtmp1 = _mm256_slli_epi16( diff2, 4 );
        xtmp1 = _mm256_mulhrs_epi16( xtmp1, xscl );

        xtmp1 = _mm256_add_epi16( xinp, xtmp1 );

        xtmp1 = _mm256_min_epi16( xtmp1, mMax );
        xtmp1 = _mm256_max_epi16( xtmp1, mMin );

        _mm256_store_si256( (__m256i*) & ptr[x+2*ptrStride], xtmp1 );

        // Row 3.
        xinp = _mm256_unpacklo_epi8( _mm256_shuffle_epi8( mInputPivotLo, xidx3 ), _mm256_shuffle_epi8( mInputPivotHi, xidx3 ) );
        xscl = _mm256_unpacklo_epi8( _mm256_shuffle_epi8( mScaleCoeffLo, xidx3 ), _mm256_shuffle_epi8( mScaleCoeffHi, xidx3 ) );

        xtmp1 = _mm256_slli_epi16( diff3, 4 );
        xtmp1 = _mm256_mulhrs_epi16( xtmp1, xscl );

        xtmp1 = _mm256_add_epi16( xinp, xtmp1 );

        xtmp1 = _mm256_min_epi16( xtmp1, mMax );
        xtmp1 = _mm256_max_epi16( xtmp1, mMin );

        _mm256_store_si256( (__m256i*) & ptr[x+3*ptrStride], xtmp1 );
      }

      ptr += ( ptrStride << 2 );
    }

    _mm256_zeroupper();
  }
  else
#endif
  if( ( width & 7 ) == 0 )
  {
    // 128-bit variant of the algorithm above: 4 rows x 8 samples per step.
    __m128i xtmp1, xtmp2;

    // Byte-table repacking of InputPivot (see AVX2 path for the scheme).
    xtmp1 = _mm_loadu_si128( ( const __m128i* ) &InputPivot[0] );
    xtmp2 = _mm_loadu_si128( ( const __m128i* ) &InputPivot[8] );
    xtmp1 = _mm_shuffle_epi8( xtmp1, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
    xtmp2 = _mm_shuffle_epi8( xtmp2, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );

    const __m128i mInputPivotLo = _mm_unpacklo_epi64( xtmp1, xtmp2 );
    const __m128i mInputPivotHi = _mm_unpackhi_epi64( xtmp1, xtmp2 );

    // Byte-table repacking of InvScCoeff.
    xtmp1 = _mm_loadu_si128( ( const __m128i* ) &InvScCoeff[0] );
    xtmp2 = _mm_loadu_si128( ( const __m128i* ) &InvScCoeff[8] );
    xtmp1 = _mm_shuffle_epi8( xtmp1, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
    xtmp2 = _mm_shuffle_epi8( xtmp2, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );

    const __m128i mScaleCoeffLo = _mm_unpacklo_epi64( xtmp1, xtmp2 );
    const __m128i mScaleCoeffHi = _mm_unpackhi_epi64( xtmp1, xtmp2 );

    // Output clipping range [0, (1 << bd) - 1].
    const __m128i mMin    = _mm_setzero_si128();
    const __m128i mMax    = _mm_set1_epi16( ( 1 << bd ) - 1 );

    for( int y = 0; y < height; y += 4 )
    {
      _mm_prefetch( ( const char* ) ( ptr + 4 * ptrStride ), _MM_HINT_T0 );
      _mm_prefetch( ( const char* ) ( ptr + 5 * ptrStride ), _MM_HINT_T0 );
      _mm_prefetch( ( const char* ) ( ptr + 6 * ptrStride ), _MM_HINT_T0 );
      _mm_prefetch( ( const char* ) ( ptr + 7 * ptrStride ), _MM_HINT_T0 );

      for( int x = 0; x < width; x += 8 )
      {
        const __m128i xsrc0 = _mm_load_si128( ( const __m128i* ) ( ptr + x + 0 * ptrStride ) );
        const __m128i xsrc1 = _mm_load_si128( ( const __m128i* ) ( ptr + x + 1 * ptrStride ) );
        const __m128i xsrc2 = _mm_load_si128( ( const __m128i* ) ( ptr + x + 2 * ptrStride ) );
        const __m128i xsrc3 = _mm_load_si128( ( const __m128i* ) ( ptr + x + 3 * ptrStride ) );

        // Horizontal minimum of the 4x8 tile -> common starting bin.
        __m128i diff0, diff1, diff2, diff3;
        diff0   = _mm_minpos_epu16( _mm_min_epi16( _mm_min_epi16( xsrc0, xsrc1 ), _mm_min_epi16( xsrc2, xsrc3 ) ) );
        int min = _mm_extract_epi16( diff0, 0 );

        // Unrolled fallthrough scan for the last pivot <= min (see AVX2 path).
        int i = minBin;
        switch( effMaxBin - minBin )
        {
        default:
        case 15: if( min < LmcsPivot[++i] ) { break; };
        case 14: if( min < LmcsPivot[++i] ) { break; };
        case 13: if( min < LmcsPivot[++i] ) { break; };
        case 12: if( min < LmcsPivot[++i] ) { break; };
        case 11: if( min < LmcsPivot[++i] ) { break; };
        case 10: if( min < LmcsPivot[++i] ) { break; };
        case  9: if( min < LmcsPivot[++i] ) { break; };
        case  8: if( min < LmcsPivot[++i] ) { break; };
        case  7: if( min < LmcsPivot[++i] ) { break; };
        case  6: if( min < LmcsPivot[++i] ) { break; };
        case  5: if( min < LmcsPivot[++i] ) { break; };
        case  4: if( min < LmcsPivot[++i] ) { break; };
        case  3: if( min < LmcsPivot[++i] ) { break; };
        case  2: if( min < LmcsPivot[++i] ) { break; };
        case  1: if( min < LmcsPivot[++i] ) { break; };
        case  0: if( min < LmcsPivot[++i] ) { break; };
        }

        --i;

        __m128i xidx0 = _mm_set1_epi16( i );
        __m128i xidx1 = xidx0;
        __m128i xidx2 = xidx0;
        __m128i xidx3 = xidx0;

        __m128i xlmcs = _mm_set1_epi16( LmcsPivot[i] );

        diff0 = _mm_sub_epi16( xsrc0, xlmcs );
        diff1 = _mm_sub_epi16( xsrc1, xlmcs );
        diff2 = _mm_sub_epi16( xsrc2, xlmcs );
        diff3 = _mm_sub_epi16( xsrc3, xlmcs );

        // Per-lane bin refinement via unsigned-min + cmpeq mask (see above).
        for( ++i; i <= effMaxBin; ++i )
        {
          xlmcs         = _mm_set1_epi16( LmcsPivot[i] );

          __m128i currd = _mm_sub_epi16( xsrc0, xlmcs );
          diff0         = _mm_min_epu16( diff0, currd );
          __m128i chnd0 = _mm_cmpeq_epi16( currd, diff0 );

          currd         = _mm_sub_epi16( xsrc1, xlmcs );
          diff1         = _mm_min_epu16( diff1, currd );
          __m128i chnd1 = _mm_cmpeq_epi16( currd, diff1 );

          currd         = _mm_sub_epi16( xsrc2, xlmcs );
          diff2         = _mm_min_epu16( diff2, currd );
          __m128i chnd2 = _mm_cmpeq_epi16( currd, diff2 );

          currd         = _mm_sub_epi16( xsrc3, xlmcs );
          diff3         = _mm_min_epu16( diff3, currd );
          __m128i chnd3 = _mm_cmpeq_epi16( currd, diff3 );

          xidx0         = _mm_sub_epi16( xidx0, chnd0 );
          xidx1         = _mm_sub_epi16( xidx1, chnd1 );
          xidx2         = _mm_sub_epi16( xidx2, chnd2 );
          xidx3         = _mm_sub_epi16( xidx3, chnd3 );

          chnd0         = _mm_or_si128( chnd0, chnd1 );
          chnd2         = _mm_or_si128( chnd2, chnd3 );
          chnd0         = _mm_or_si128( chnd0, chnd2 );

          if( _mm_movemask_epi8( chnd0 ) == 0 ) break;
        }

        xidx0 = _mm_packs_epi16( xidx0, _mm_set1_epi8( -1 ) );
        xidx1 = _mm_packs_epi16( xidx1, _mm_set1_epi8( -1 ) );
        xidx2 = _mm_packs_epi16( xidx2, _mm_set1_epi8( -1 ) );
        xidx3 = _mm_packs_epi16( xidx3, _mm_set1_epi8( -1 ) );

        // Row 0: lookup, scale, offset, clip, store.
        __m128i xinp = _mm_unpacklo_epi8( _mm_shuffle_epi8( mInputPivotLo, xidx0 ), _mm_shuffle_epi8( mInputPivotHi, xidx0 ) );
        __m128i xscl = _mm_unpacklo_epi8( _mm_shuffle_epi8( mScaleCoeffLo, xidx0 ), _mm_shuffle_epi8( mScaleCoeffHi, xidx0 ) );

        xtmp1 = _mm_slli_epi16( diff0, 4 );
        xtmp1 = _mm_mulhrs_epi16( xtmp1, xscl );

        xtmp1 = _mm_add_epi16( xinp, xtmp1 );

        xtmp1 = _mm_min_epi16( xtmp1, mMax );
        xtmp1 = _mm_max_epi16( xtmp1, mMin );

        _mm_store_si128( ( __m128i * ) &ptr[x], xtmp1 );

        // Row 1.
        xinp = _mm_unpacklo_epi8( _mm_shuffle_epi8( mInputPivotLo, xidx1 ), _mm_shuffle_epi8( mInputPivotHi, xidx1 ) );
        xscl = _mm_unpacklo_epi8( _mm_shuffle_epi8( mScaleCoeffLo, xidx1 ), _mm_shuffle_epi8( mScaleCoeffHi, xidx1 ) );

        xtmp1 = _mm_slli_epi16( diff1, 4 );
        xtmp1 = _mm_mulhrs_epi16( xtmp1, xscl );

        xtmp1 = _mm_add_epi16( xinp, xtmp1 );

        xtmp1 = _mm_min_epi16( xtmp1, mMax );
        xtmp1 = _mm_max_epi16( xtmp1, mMin );

        _mm_store_si128( (__m128i*) & ptr[x+ptrStride], xtmp1 );

        // Row 2.
        xinp = _mm_unpacklo_epi8( _mm_shuffle_epi8( mInputPivotLo, xidx2 ), _mm_shuffle_epi8( mInputPivotHi, xidx2 ) );
        xscl = _mm_unpacklo_epi8( _mm_shuffle_epi8( mScaleCoeffLo, xidx2 ), _mm_shuffle_epi8( mScaleCoeffHi, xidx2 ) );

        xtmp1 = _mm_slli_epi16( diff2, 4 );
        xtmp1 = _mm_mulhrs_epi16( xtmp1, xscl );

        xtmp1 = _mm_add_epi16( xinp, xtmp1 );

        xtmp1 = _mm_min_epi16( xtmp1, mMax );
        xtmp1 = _mm_max_epi16( xtmp1, mMin );

        _mm_store_si128( (__m128i*) & ptr[x + 2 * ptrStride], xtmp1 );

        // Row 3.
        xinp = _mm_unpacklo_epi8( _mm_shuffle_epi8( mInputPivotLo, xidx3 ), _mm_shuffle_epi8( mInputPivotHi, xidx3 ) );
        xscl = _mm_unpacklo_epi8( _mm_shuffle_epi8( mScaleCoeffLo, xidx3 ), _mm_shuffle_epi8( mScaleCoeffHi, xidx3 ) );

        xtmp1 = _mm_slli_epi16( diff3, 4 );
        xtmp1 = _mm_mulhrs_epi16( xtmp1, xscl );

        xtmp1 = _mm_add_epi16( xinp, xtmp1 );

        xtmp1 = _mm_min_epi16( xtmp1, mMax );
        xtmp1 = _mm_max_epi16( xtmp1, mMin );

        _mm_store_si128( (__m128i*) & ptr[x + 3 * ptrStride], xtmp1 );
      }

      ptr += ptrStride << 2;
    }
  }
  else
  {
    // No scalar fallback exists for odd widths.
    THROW_FATAL( "Unsupported size!" );
  }
}
Unexecuted instantiation: void vvdec::rspBcwCore_SIMD<(vvdec::x86_simd::X86_VEXT)1>(short*, long, int, int, int, int, int, short const*, short const*, short const*)
Unexecuted instantiation: void vvdec::rspBcwCore_SIMD<(vvdec::x86_simd::X86_VEXT)4>(short*, long, int, int, int, int, int, short const*, short const*, short const*)
1039
1040
template<X86_VEXT vext>
1041
void rspFwdCore_SIMD( Pel* ptr, ptrdiff_t ptrStride, int width, int height, const int bd, const Pel OrgCW, const Pel* LmcsPivot, const Pel* ScaleCoeff, const Pel* InputPivot )
1042
0
{
1043
0
  _mm_prefetch( ( const char* ) (ptr + 0 * ptrStride), _MM_HINT_T0 );
1044
0
  _mm_prefetch( ( const char* ) (ptr + 1 * ptrStride), _MM_HINT_T0 );
1045
1046
0
  int shift = getLog2( OrgCW );
1047
1048
#if USE_AVX2
1049
0
  if( ( width & 15 ) == 0 )
1050
0
  {
1051
0
    __m128i xtmp1, xtmp2, xtmp3, xtmp4;
1052
0
    xtmp1 = _mm_loadu_si128( ( const __m128i* ) &LmcsPivot[0] );
1053
0
    xtmp2 = _mm_loadu_si128( ( const __m128i* ) &LmcsPivot[8] );
1054
0
    xtmp3 = _mm_shuffle_epi8( xtmp1, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1055
0
    xtmp4 = _mm_shuffle_epi8( xtmp2, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1056
0
    xtmp1 = _mm_unpacklo_epi64( xtmp3, xtmp4 );
1057
0
    xtmp2 = _mm_unpackhi_epi64( xtmp3, xtmp4 );
1058
1059
    const __m256i mLmcsPivotLo = _mm256_inserti128_si256( _mm256_castsi128_si256( xtmp1 ), xtmp1, 1 );
1060
    const __m256i mLmcsPivotHi = _mm256_inserti128_si256( _mm256_castsi128_si256( xtmp2 ), xtmp2, 1 );
1061
1062
    xtmp1 = _mm_loadu_si128( ( const __m128i* ) &InputPivot[0] );
1063
    xtmp2 = _mm_loadu_si128( ( const __m128i* ) &InputPivot[8] );
1064
    xtmp3 = _mm_shuffle_epi8( xtmp1, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1065
    xtmp4 = _mm_shuffle_epi8( xtmp2, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1066
    xtmp1 = _mm_unpacklo_epi64( xtmp3, xtmp4 );
1067
    xtmp2 = _mm_unpackhi_epi64( xtmp3, xtmp4 );
1068
1069
    const __m256i mInputPivotLo = _mm256_inserti128_si256( _mm256_castsi128_si256( xtmp1 ), xtmp1, 1 );
1070
    const __m256i mInputPivotHi = _mm256_inserti128_si256( _mm256_castsi128_si256( xtmp2 ), xtmp2, 1 );
1071
1072
    xtmp1 = _mm_loadu_si128( ( const __m128i* ) &ScaleCoeff[0] );
1073
    xtmp2 = _mm_loadu_si128( ( const __m128i* ) &ScaleCoeff[8] );
1074
    xtmp3 = _mm_shuffle_epi8( xtmp1, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1075
    xtmp4 = _mm_shuffle_epi8( xtmp2, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1076
    xtmp1 = _mm_unpacklo_epi64( xtmp3, xtmp4 );
1077
    xtmp2 = _mm_unpackhi_epi64( xtmp3, xtmp4 );
1078
1079
    const __m256i mScaleCoeffLo = _mm256_inserti128_si256( _mm256_castsi128_si256( xtmp1 ), xtmp1, 1 );
1080
    const __m256i mScaleCoeffHi = _mm256_inserti128_si256( _mm256_castsi128_si256( xtmp2 ), xtmp2, 1 );
1081
1082
    const __m256i mMin    = _mm256_setzero_si256();
1083
    const __m256i mMax    = _mm256_set1_epi16( ( 1 << bd ) - 1 );
1084
1085
    //#define RSP_FWD_OP( ADDR ) { idxY = ( ptr[ADDR] >> shift ); ptr[ADDR] = static_cast<Pel>( ClipBD<int>( LmcsPivot[idxY] + ( ( ScaleCoeff[idxY] * ( ptr[ADDR] - InputPivot[idxY] ) + ( 1 << 10 ) ) >> 11 ), bd ) ); }
1086
1087
0
    while( height-- )
1088
0
    {
1089
0
      _mm_prefetch( ( const char* ) ( ptr + ptrStride ), _MM_HINT_T0 );
1090
1091
0
      for( int x = 0; x < width; x += 16 )
1092
0
      {
1093
0
        const __m256i xsrc = _mm256_loadu_si256( ( const __m256i* ) &ptr[x] );
1094
0
        const __m256i xidx = _mm256_packs_epi16( _mm256_srai_epi16 ( xsrc, shift ), _mm256_set1_epi8( -1 ) );
1095
1096
0
        const __m256i xinp = _mm256_unpacklo_epi8( _mm256_shuffle_epi8( mInputPivotLo, xidx ), _mm256_shuffle_epi8( mInputPivotHi, xidx ) );
1097
0
        const __m256i xscl = _mm256_unpacklo_epi8( _mm256_shuffle_epi8( mScaleCoeffLo, xidx ), _mm256_shuffle_epi8( mScaleCoeffHi, xidx ) );
1098
0
        const __m256i xlmc = _mm256_unpacklo_epi8( _mm256_shuffle_epi8( mLmcsPivotLo,  xidx ), _mm256_shuffle_epi8( mLmcsPivotHi,  xidx ) );
1099
1100
0
        __m256i
1101
0
        vtmp1 = _mm256_slli_epi16( _mm256_subs_epi16( xsrc, xinp ), 4 );
1102
0
        vtmp1 = _mm256_mulhrs_epi16( vtmp1, xscl );
1103
1104
0
        vtmp1 = _mm256_add_epi16( xlmc, vtmp1 );
1105
1106
0
        vtmp1 = _mm256_min_epi16( vtmp1, mMax );
1107
0
        vtmp1 = _mm256_max_epi16( vtmp1, mMin );
1108
1109
0
        _mm256_storeu_si256( ( __m256i * ) &ptr[x], vtmp1 );
1110
0
      }
1111
1112
0
      ptr += ptrStride;
1113
0
    }
1114
1115
0
    _mm256_zeroupper();
1116
0
  }
1117
0
  else
1118
0
#endif
1119
0
  if( ( width & 7 ) == 0 )
1120
0
  {
1121
0
    __m128i xtmp1, xtmp2, xtmp3, xtmp4;
1122
0
    xtmp1 = _mm_loadu_si128( ( const __m128i* ) &LmcsPivot[0] );
1123
0
    xtmp2 = _mm_loadu_si128( ( const __m128i* ) &LmcsPivot[8] );
1124
0
    xtmp3 = _mm_shuffle_epi8( xtmp1, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1125
0
    xtmp4 = _mm_shuffle_epi8( xtmp2, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1126
1127
0
    const __m128i mLmcsPivotLo = _mm_unpacklo_epi64( xtmp3, xtmp4 );
1128
0
    const __m128i mLmcsPivotHi = _mm_unpackhi_epi64( xtmp3, xtmp4 );
1129
1130
0
    xtmp1 = _mm_loadu_si128( ( const __m128i* ) &InputPivot[0] );
1131
0
    xtmp2 = _mm_loadu_si128( ( const __m128i* ) &InputPivot[8] );
1132
0
    xtmp3 = _mm_shuffle_epi8( xtmp1, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1133
0
    xtmp4 = _mm_shuffle_epi8( xtmp2, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1134
1135
0
    const __m128i mInputPivotLo = _mm_unpacklo_epi64( xtmp3, xtmp4 );
1136
0
    const __m128i mInputPivotHi = _mm_unpackhi_epi64( xtmp3, xtmp4 );
1137
1138
1139
0
    xtmp1 = _mm_loadu_si128( ( const __m128i* ) &ScaleCoeff[0] );
1140
0
    xtmp2 = _mm_loadu_si128( ( const __m128i* ) &ScaleCoeff[8] );
1141
0
    xtmp3 = _mm_shuffle_epi8( xtmp1, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1142
0
    xtmp4 = _mm_shuffle_epi8( xtmp2, _mm_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ) );
1143
1144
0
    const __m128i mScaleCoeffLo = _mm_unpacklo_epi64( xtmp3, xtmp4 );
1145
0
    const __m128i mScaleCoeffHi = _mm_unpackhi_epi64( xtmp3, xtmp4 );
1146
1147
0
    const __m128i mMin    = _mm_setzero_si128();
1148
0
    const __m128i mMax    = _mm_set1_epi16( ( 1 << bd ) - 1 );
1149
1150
0
    while( height-- )
1151
0
    {
1152
0
      _mm_prefetch( ( const char* ) ( ptr + ptrStride ), _MM_HINT_T0 );
1153
1154
0
      for( int x = 0; x < width; x += 8 )
1155
0
      {
1156
0
        const __m128i xsrc = _mm_loadu_si128( ( const __m128i* ) &ptr[x] );
1157
0
        const __m128i xidx = _mm_packs_epi16( _mm_srai_epi16 ( xsrc, shift ), _mm_set1_epi8( -1 ) );
1158
1159
0
        const __m128i xlmc = _mm_unpacklo_epi8( _mm_shuffle_epi8( mLmcsPivotLo,  xidx ), _mm_shuffle_epi8( mLmcsPivotHi,  xidx ) );
1160
0
        const __m128i xinp = _mm_unpacklo_epi8( _mm_shuffle_epi8( mInputPivotLo, xidx ), _mm_shuffle_epi8( mInputPivotHi, xidx ) );
1161
0
        const __m128i xscl = _mm_unpacklo_epi8( _mm_shuffle_epi8( mScaleCoeffLo, xidx ), _mm_shuffle_epi8( mScaleCoeffHi, xidx ) );
1162
1163
0
        xtmp1 = _mm_slli_epi16( _mm_subs_epi16( xsrc, xinp ), 4 );
1164
0
        xtmp1 = _mm_mulhrs_epi16( xtmp1, xscl );
1165
1166
0
        xtmp1 = _mm_add_epi16( xlmc, xtmp1 );
1167
1168
0
        xtmp1 = _mm_min_epi16( xtmp1, mMax );
1169
0
        xtmp1 = _mm_max_epi16( xtmp1, mMin );
1170
1171
0
        _mm_storeu_si128( ( __m128i * ) &ptr[x], xtmp1 );
1172
0
      }
1173
1174
0
      ptr += ptrStride;
1175
0
    }
1176
0
  }
1177
0
  else
1178
0
  {
1179
0
    int idxY;
1180
1181
    //    const auto rsp_sgnl_op  = [=, &dst]( int ADDR ){ idxY = ( dst[ADDR] >> shift ); dst[ADDR] = static_cast<Pel>( ClipBD<int>( LmcsPivot[idxY] + ( ( ScaleCoeff[idxY] * ( dst[ADDR] - InputPivot[idxY] ) + ( 1 << 10 ) ) >> 11 ), bd ) ); };
1182
    //    const auto rsp_sgnl_inc = [=, &dst]            { dst += stride; };
1183
1184
    //    size_aware_pel_op( rsp_sgnl_op, rsp_sgnl_inc, width, height );
1185
1186
0
#define RSP_FWD_OP( ADDR ) { idxY = ( ptr[ADDR] >> shift ); ptr[ADDR] = static_cast<Pel>( ClipBD<int>( LmcsPivot[idxY] + ( ( ScaleCoeff[idxY] * ( ptr[ADDR] - InputPivot[idxY] ) + ( 1 << 10 ) ) >> 11 ), bd ) ); }
1187
0
#define RSP_FWD_INC        ptr      += ptrStride;
1188
1189
0
    SIZE_AWARE_PER_EL_OP( RSP_FWD_OP, RSP_FWD_INC )
1190
1191
0
#undef RSP_FWD_OP
1192
0
#undef RSP_FWD_INC
1193
0
  }
1194
0
}
Unexecuted instantiation: void vvdec::rspFwdCore_SIMD<(vvdec::x86_simd::X86_VEXT)1>(short*, long, int, int, int, short, short const*, short const*, short const*)
Unexecuted instantiation: void vvdec::rspFwdCore_SIMD<(vvdec::x86_simd::X86_VEXT)4>(short*, long, int, int, int, short, short const*, short const*, short const*)
1195
1196
#if INTPTR_MAX == INT64_MAX
1197
template<X86_VEXT vext>
1198
void fillN_CU_SIMD( CodingUnit** ptr, ptrdiff_t ptrStride, int width, int height, CodingUnit* cuPtr )
1199
0
{
1200
0
  static_assert( sizeof( cuPtr ) == 8, "Only supported for 64bit systems!" );
1201
0
  if( ( width & 3 ) == 0 )
1202
0
  {
1203
#if USE_AVX2
1204
    __m256i vval = _mm256_set1_epi64x( ( int64_t ) cuPtr );
1205
1206
0
    while( height-- )
1207
0
    {
1208
0
      for( int x = 0; x < width; x += 4 ) _mm256_storeu_si256( ( __m256i* ) &ptr[x], vval );
1209
1210
0
      ptr += ptrStride;
1211
0
    }
1212
#else
1213
    __m128i vval = _mm_set1_epi64x( ( int64_t ) cuPtr );
1214
1215
0
    while( height-- )
1216
0
    {
1217
0
      for( int x = 0; x < width; x += 4 )
1218
0
      {
1219
0
        _mm_storeu_si128( ( __m128i* ) &ptr[x + 0], vval );
1220
0
        _mm_storeu_si128( ( __m128i* ) &ptr[x + 2], vval );
1221
0
      }
1222
1223
0
      ptr += ptrStride;
1224
0
    }
1225
#endif
1226
0
  }
1227
0
  else if( ( width & 1 ) == 0 )
1228
0
  {
1229
0
    __m128i vval = _mm_set1_epi64x( ( int64_t ) cuPtr );
1230
1231
0
    while( height-- )
1232
0
    {
1233
0
      for( int x = 0; x < width; x += 2 ) _mm_storeu_si128( ( __m128i* ) &ptr[x], vval );
1234
1235
0
      ptr += ptrStride;
1236
0
    }
1237
0
  }
1238
0
  else
1239
0
  {
1240
0
    while( height-- )
1241
0
    {
1242
0
      *ptr = cuPtr; ptr += ptrStride;
1243
0
    }
1244
0
  }
1245
0
}
Unexecuted instantiation: void vvdec::fillN_CU_SIMD<(vvdec::x86_simd::X86_VEXT)1>(vvdec::CodingUnit**, long, int, int, vvdec::CodingUnit*)
Unexecuted instantiation: void vvdec::fillN_CU_SIMD<(vvdec::x86_simd::X86_VEXT)4>(vvdec::CodingUnit**, long, int, int, vvdec::CodingUnit*)
1246
#elif INTPTR_MAX == INT32_MAX
1247
template<X86_VEXT vext>
1248
void fillN_CU_SIMD( CodingUnit** ptr, ptrdiff_t ptrStride, int width, int height, CodingUnit* cuPtr )
1249
{
1250
  static_assert( sizeof( cuPtr ) == 4, "Only supported for 32bit systems!" );
1251
  if( ( width & 7 ) == 0 )
1252
  {
1253
#if USE_AVX2
1254
    __m256i vval = _mm256_set1_epi32( ( int32_t ) cuPtr );
1255
1256
    while( height-- )
1257
    {
1258
      for( int x = 0; x < width; x += 8 )
1259
      {
1260
        _mm256_storeu_si256( (__m256i*) &ptr[x], vval );
1261
      }
1262
1263
      ptr += ptrStride;
1264
    }
1265
#else
1266
    __m128i vval = _mm_set1_epi32( ( int32_t ) cuPtr );
1267
1268
    while( height-- )
1269
    {
1270
      for( int x = 0; x < width; x += 8 )
1271
      {
1272
        _mm_storeu_si128( ( __m128i* ) &ptr[x + 0], vval );
1273
        _mm_storeu_si128( ( __m128i* ) &ptr[x + 4], vval );
1274
      }
1275
1276
      ptr += ptrStride;
1277
    }
1278
#endif
1279
  }
1280
  else if( ( width & 3 ) == 0 )
1281
  {
1282
    __m128i vval = _mm_set1_epi32( ( int32_t ) cuPtr );
1283
1284
    while( height-- )
1285
    {
1286
      for( int x = 0; x < width; x += 4 )
1287
      {
1288
        _mm_storeu_si128( (__m128i*) &ptr[x], vval );
1289
      }
1290
1291
      ptr += ptrStride;
1292
    }
1293
  }
1294
  else if( ( width & 1 ) == 0 )
1295
  {
1296
    while( height-- )
1297
    {
1298
      ptr[0] = cuPtr;
1299
      ptr[1] = cuPtr;
1300
1301
      ptr += ptrStride;
1302
    }
1303
  }
1304
  else
1305
  {
1306
    while( height-- )
1307
    {
1308
      for( int x = 0; x < width; ++x )
1309
      {
1310
        ptr[x] = cuPtr;
1311
      }
1312
      ptr += ptrStride;
1313
    }
1314
  }
1315
}
1316
#endif  // INTPTR_MAX == INT32_MAX
1317
1318
template<X86_VEXT vext>
// Separable 8-tap resampling (RPR) for luma: horizontal filtering pass into a
// 32-bit intermediate buffer, then vertical filtering pass with normalization
// and clipping into the destination. The intermediate buffer is stored
// TRANSPOSED (column-major: tmpBuf + i * tmpHeight + j) so that the vertical
// pass reads each output column contiguously.
void sampleRateConvSIMD_8tap( const std::pair<int, int> scalingRatio,
                              const std::pair<int, int> compScale,
                              const Pel*                orgSrc,
                              const ptrdiff_t           orgStride,
                              const int                 orgWidth,
                              const int                 orgHeight,
                              const int                 beforeScaleLeftOffset,
                              const int                 beforeScaleTopOffset,
                              Pel*                      scaledSrc,
                              const ptrdiff_t           scaledStride,
                              const int                 scaledWidth,
                              const int                 scaledHeight,
                              const int                 afterScaleLeftOffset,
                              const int                 afterScaleTopOffset,
                              const int                 bitDepth )
{
  // This variant is hard-wired to the 8-tap luma filter with collocated
  // luma sample positions (flags fixed to 1).
  static constexpr bool useLumaFilter = true;
  static constexpr int horCollocatedPositionFlag = 1;
  static constexpr int verCollocatedPositionFlag = 1;

  const TFilterCoeff* filterHor = useLumaFilter ? &InterpolationFilter::m_lumaFilter[0][0] : &InterpolationFilter::m_chromaFilter[0][0];
  const TFilterCoeff* filterVer = useLumaFilter ? &InterpolationFilter::m_lumaFilter[0][0] : &InterpolationFilter::m_chromaFilter[0][0];

  // luma: 16 fractional positions (4-bit phase); chroma would use 32 (5-bit)
  const int numFracPositions  = useLumaFilter ? 15 : 31;
  const int numFracShift      = useLumaFilter ? 4 : 5;
  const int posShiftX = SCALE_RATIO_BITS - numFracShift + compScale.first;
  const int posShiftY = SCALE_RATIO_BITS - numFracShift + compScale.second;
  // rounding offset + pre-scale window offset + phase-shift compensation term
  int addX = (1 << (posShiftX - 1)) + (beforeScaleLeftOffset << SCALE_RATIO_BITS) + ((int( 1 - horCollocatedPositionFlag ) * 8 * (scalingRatio.first - SCALE_1X.first) + (1 << (2 + compScale.first))) >> (3 + compScale.first));
  int addY = (1 << (posShiftY - 1)) + (beforeScaleTopOffset << SCALE_RATIO_BITS) + ((int( 1 - verCollocatedPositionFlag ) * 8 * (scalingRatio.second - SCALE_1X.second) + (1 << (2 + compScale.second))) >> (3 + compScale.second));

  const int filterLength = useLumaFilter ? NTAPS_LUMA : NTAPS_CHROMA;
  const int log2Norm = 12;  // combined normalization of the two 6-bit filter passes

  CHECK( bitDepth > 17, "Overflow may happen!" );
  const int maxVal = (1 << bitDepth) - 1;

  // round dimensions up to multiples of 4 so both passes can work on 4-wide groups
  const int tmpStride = ( ( scaledWidth + 3 ) / 4 ) * 4;
  const int tmpHeight = ( ( orgHeight + 3 ) / 4 ) * 4;
  int*      tmpBuf    = new int[tmpStride * tmpHeight];

  // --- horizontal pass: 4 source rows at a time ---
  for( int j = 0; j < orgHeight; j += 4 )
  {
    // clamp row indices at the bottom edge so a partial last group repeats the last row
    const Pel* org0 = orgSrc +                                j * orgStride;
    const Pel* org1 = orgSrc + std::min( j + 1, orgHeight - 1 ) * orgStride;
    const Pel* org2 = orgSrc + std::min( j + 2, orgHeight - 1 ) * orgStride;
    const Pel* org3 = orgSrc + std::min( j + 3, orgHeight - 1 ) * orgStride;

    _mm_prefetch( ( const char* ) (org0 + (orgStride << 2)), _MM_HINT_T0 );
    _mm_prefetch( ( const char* ) (org1 + (orgStride << 2)), _MM_HINT_T0 );
    _mm_prefetch( ( const char* ) (org2 + (orgStride << 2)), _MM_HINT_T0 );
    _mm_prefetch( ( const char* ) (org3 + (orgStride << 2)), _MM_HINT_T0 );

    for( int i = 0; i < scaledWidth; i++ )
    {
      // map output column i back to a fixed-point source position
      int refPos  = ( ( ( i << compScale.first ) - afterScaleLeftOffset ) * scalingRatio.first + addX ) >> posShiftX;
      int integer = refPos >> numFracShift;   // integer source column
      int frac    = refPos  & numFracPositions; // fractional phase -> filter row

      const TFilterCoeff* f = filterHor + frac * filterLength;

      __m128i vsrc0, vsrc1, vsrc2, vsrc3;

      if( integer + 0 - ( filterLength / 2 ) + 1 >= 0 && integer + ( NTAPS_LUMA - 1 ) - ( filterLength / 2 ) + 1 < orgWidth )
      {
        // fully inside the picture: direct unaligned 8-sample loads
        int xInt = integer + 0 - ( filterLength / 2 ) + 1;

        vsrc0 = _mm_loadu_si128( (const __m128i*) &org0[xInt] );
        vsrc1 = _mm_loadu_si128( (const __m128i*) &org1[xInt] );
        vsrc2 = _mm_loadu_si128( (const __m128i*) &org2[xInt] );
        vsrc3 = _mm_loadu_si128( (const __m128i*) &org3[xInt] );
      }
      else
      {
        // near the left/right border: gather taps with edge clamping
        Pel src[4][NTAPS_LUMA];

        for( int k = 0; k < filterLength; k++ )
        {
          int xInt = std::min<int>( std::max( 0, integer + k - filterLength / 2 + 1 ), orgWidth - 1 );

          src[0][k] = org0[xInt];
          src[1][k] = org1[xInt];
          src[2][k] = org2[xInt];
          src[3][k] = org3[xInt];
        }

        vsrc0 = _mm_loadu_si128( (const __m128i*) &src[0][0] );
        vsrc1 = _mm_loadu_si128( (const __m128i*) &src[1][0] );
        vsrc2 = _mm_loadu_si128( (const __m128i*) &src[2][0] );
        vsrc3 = _mm_loadu_si128( (const __m128i*) &src[3][0] );
      }

      // 8 coefficients x 8 samples -> 4 partial 32-bit sums per row
      __m128i vflt  = _mm_loadu_si128( (const __m128i*)  f );

      __m128i vres0 = _mm_madd_epi16( vsrc0, vflt );
      __m128i vres1 = _mm_madd_epi16( vsrc1, vflt );
      __m128i vres2 = _mm_madd_epi16( vsrc2, vflt );
      __m128i vres3 = _mm_madd_epi16( vsrc3, vflt );

      // horizontal reduction: one 32-bit sum per source row
      vres0 = _mm_hadd_epi32( vres0, vres1 );
      vres2 = _mm_hadd_epi32( vres2, vres3 );

      vres0 = _mm_hadd_epi32( vres0, vres2 );

      // store the 4 row results transposed (column i, rows j..j+3)
      int* tmp = tmpBuf + i * tmpHeight + j;

      _mm_storeu_si128( (__m128i*) tmp, vres0 );
    }
  }

  // --- vertical pass: 4 output columns at a time ---
  __m128i vzero = _mm_setzero_si128();
  __m128i vnorm = _mm_set1_epi32( ( 1 << ( log2Norm - 1 ) ) );

  for( int i = 0; i < scaledWidth; i += 4 )
  {
    Pel* dst = scaledSrc;

    // duplicate the first column for out-of-range lanes of a partial group;
    // the store tail below never writes those lanes
    int* tmp0 =                       tmpBuf + i       * tmpHeight;
    int* tmp1 = i + 1 < scaledWidth ? tmpBuf + (i + 1) * tmpHeight : tmp0;
    int* tmp2 = i + 2 < scaledWidth ? tmpBuf + (i + 2) * tmpHeight : tmp0;
    int* tmp3 = i + 3 < scaledWidth ? tmpBuf + (i + 3) * tmpHeight : tmp0;

    _mm_prefetch( ( const char* ) (tmp0 + (tmpHeight << 2)), _MM_HINT_T0 );
    _mm_prefetch( ( const char* ) (tmp1 + (tmpHeight << 2)), _MM_HINT_T0 );
    _mm_prefetch( ( const char* ) (tmp2 + (tmpHeight << 2)), _MM_HINT_T0 );
    _mm_prefetch( ( const char* ) (tmp3 + (tmpHeight << 2)), _MM_HINT_T0 );

    for( int j = 0; j < scaledHeight; j++ )
    {
      const int refPos      = ( ( ( j << compScale.second ) - afterScaleTopOffset ) * scalingRatio.second + addY ) >> posShiftY;
      const int integer     = refPos >> numFracShift;
      const int frac        = refPos & numFracPositions;
      const TFilterCoeff* f = filterVer + frac * filterLength;
      __m128i vres0, vres1, vres2, vres3;

#if USE_AVX2
      if( vext >= AVX2 )
      {
        // widen the 8 16-bit coefficients to 32 bit: one YMM per 8-tap column
        __m256i vflt = _mm256_cvtepi16_epi32( _mm_loadu_si128( (const __m128i*)  f ) );

        __m256i vsrc0, vsrc1, vsrc2, vsrc3;

        if( integer + 0 - (filterLength / 2) + 1 >= 0 && integer + (NTAPS_LUMA - 1) - (filterLength / 2) + 1 < orgHeight )
        {
          int yInt = integer + 0 - (filterLength / 2) + 1;

          vsrc0 = _mm256_loadu_si256( (const __m256i*) &tmp0[yInt] );
          vsrc1 = _mm256_loadu_si256( (const __m256i*) &tmp1[yInt] );
          vsrc2 = _mm256_loadu_si256( (const __m256i*) &tmp2[yInt] );
          vsrc3 = _mm256_loadu_si256( (const __m256i*) &tmp3[yInt] );
        }
        else
        {
          // top/bottom border: gather with vertical edge clamping
          int src[4][NTAPS_LUMA];

          for( int k = 0; k < filterLength; k++ )
          {
            int yInt = std::min<int>( std::max( 0, integer + k - filterLength / 2 + 1 ), orgHeight - 1 );
            src[0][k] = tmp0[yInt];
            src[1][k] = tmp1[yInt];
            src[2][k] = tmp2[yInt];
            src[3][k] = tmp3[yInt];
          }

          vsrc0 = _mm256_loadu_si256( (const __m256i*) &src[0][0] );
          vsrc1 = _mm256_loadu_si256( (const __m256i*) &src[1][0] );
          vsrc2 = _mm256_loadu_si256( (const __m256i*) &src[2][0] );
          vsrc3 = _mm256_loadu_si256( (const __m256i*) &src[3][0] );
        }

        __m256i xres0 = _mm256_mullo_epi32( vsrc0, vflt );
        __m256i xres1 = _mm256_mullo_epi32( vsrc1, vflt );
        __m256i xres2 = _mm256_mullo_epi32( vsrc2, vflt );
        __m256i xres3 = _mm256_mullo_epi32( vsrc3, vflt );

        // in-lane horizontal adds, then fold the two 128-bit lanes together
        xres0 = _mm256_hadd_epi32( xres0, xres1 );
        xres2 = _mm256_hadd_epi32( xres2, xres3 );

        xres0 = _mm256_hadd_epi32( xres0, xres2 );

        vres0 = _mm_add_epi32( _mm256_castsi256_si128( xres0 ), _mm256_extracti128_si256( xres0, 1 ) );
      }
      else
#endif
      {
        // SSE path: split the 8 widened coefficients over two XMM registers
        __m128i vflt[2];
        vflt[1] = _mm_loadu_si128( (const __m128i*)  f );
        vflt[0] = _mm_cvtepi16_epi32( vflt[1] );
        vflt[1] = _mm_cvtepi16_epi32( _mm_unpackhi_epi64( vflt[1], _mm_setzero_si128() ) );

        __m128i vsrc0[2], vsrc1[2], vsrc2[2], vsrc3[2];

        if( integer + 0 - ( filterLength / 2 ) + 1 >= 0 && integer + ( NTAPS_LUMA - 1 ) - ( filterLength / 2 ) + 1 < orgHeight )
        {
          int yInt = integer + 0 - ( filterLength / 2 ) + 1;

          vsrc0[0] = _mm_loadu_si128( (const __m128i*) &tmp0[yInt]);
          vsrc0[1] = _mm_loadu_si128( (const __m128i*) &tmp0[yInt + 4]);

          vsrc1[0] = _mm_loadu_si128( (const __m128i*) &tmp1[yInt]);
          vsrc1[1] = _mm_loadu_si128( (const __m128i*) &tmp1[yInt + 4]);

          vsrc2[0] = _mm_loadu_si128( (const __m128i*) &tmp2[yInt]);
          vsrc2[1] = _mm_loadu_si128( (const __m128i*) &tmp2[yInt + 4]);

          vsrc3[0] = _mm_loadu_si128( (const __m128i*) &tmp3[yInt]);
          vsrc3[1] = _mm_loadu_si128( (const __m128i*) &tmp3[yInt + 4]);
        }
        else
        {
          // top/bottom border: gather with vertical edge clamping
          int src[4][NTAPS_LUMA];

          for( int k = 0; k < filterLength; k++ )
          {
            int yInt = std::min<int>( std::max( 0, integer + k - filterLength / 2 + 1 ), orgHeight - 1 );
            src[0][k] = tmp0[yInt];
            src[1][k] = tmp1[yInt];
            src[2][k] = tmp2[yInt];
            src[3][k] = tmp3[yInt];
          }

          vsrc0[0] = _mm_loadu_si128( (const __m128i*) &src[0][0] );
          vsrc0[1] = _mm_loadu_si128( (const __m128i*) &src[0][4] );

          vsrc1[0] = _mm_loadu_si128( (const __m128i*) &src[1][0] );
          vsrc1[1] = _mm_loadu_si128( (const __m128i*) &src[1][4] );

          vsrc2[0] = _mm_loadu_si128( (const __m128i*) &src[2][0] );
          vsrc2[1] = _mm_loadu_si128( (const __m128i*) &src[2][4] );

          vsrc3[0] = _mm_loadu_si128( (const __m128i*) &src[3][0] );
          vsrc3[1] = _mm_loadu_si128( (const __m128i*) &src[3][4] );
        }

        vres0 = _mm_add_epi32( _mm_mullo_epi32( vsrc0[0], vflt[0] ), _mm_mullo_epi32( vsrc0[1], vflt[1] ) );
        vres1 = _mm_add_epi32( _mm_mullo_epi32( vsrc1[0], vflt[0] ), _mm_mullo_epi32( vsrc1[1], vflt[1] ) );
        vres2 = _mm_add_epi32( _mm_mullo_epi32( vsrc2[0], vflt[0] ), _mm_mullo_epi32( vsrc2[1], vflt[1] ) );
        vres3 = _mm_add_epi32( _mm_mullo_epi32( vsrc3[0], vflt[0] ), _mm_mullo_epi32( vsrc3[1], vflt[1] ) );

        vres0 = _mm_hadd_epi32( vres0, vres1 );
        vres2 = _mm_hadd_epi32( vres2, vres3 );

        vres0 = _mm_hadd_epi32( vres0, vres2 );
      }

      // normalize (round, shift by log2Norm) and clip to [0, maxVal]
      vres0 = _mm_add_epi32( vres0, vnorm );
      vres0 = _mm_srai_epi32( vres0, log2Norm );
      vres0 = _mm_max_epi32( _mm_min_epi32( _mm_set1_epi32( maxVal ), vres0 ), vzero );

      vres0 = _mm_packs_epi32( vres0, _mm_setzero_si128() );

      // store up to 4 output samples, depending on how many columns remain
      if( i + 3 < scaledWidth )
      {
        _mm_storeu_si64( (__m128i*) &dst[i], vres0 );
      }
      else if( i + 2 < scaledWidth )
      {
        _mm_storeu_si32( (__m128i*) &dst[i], vres0 );
        dst[i + 2] = _mm_extract_epi16( vres0, 2 );
      }
      else if( i + 1 < scaledWidth )
      {
        _mm_storeu_si32( (__m128i*) &dst[i], vres0 );
      }
      else
      {
        dst[i] = _mm_extract_epi16( vres0, 0 );
      }

      dst += scaledStride;
    }
  }

  delete[] tmpBuf;
}
Unexecuted instantiation: void vvdec::sampleRateConvSIMD_8tap<(vvdec::x86_simd::X86_VEXT)1>(std::__1::pair<int, int>, std::__1::pair<int, int>, short const*, long, int, int, int, int, short*, long, int, int, int, int, int)
Unexecuted instantiation: void vvdec::sampleRateConvSIMD_8tap<(vvdec::x86_simd::X86_VEXT)4>(std::__1::pair<int, int>, std::__1::pair<int, int>, short const*, long, int, int, int, int, short*, long, int, int, int, int, int)
1593
1594
template<X86_VEXT vext>
1595
void sampleRateConvSIMD_4tap( const std::pair<int, int> scalingRatio,
1596
                              const std::pair<int, int> compScale,
1597
                              const Pel*                orgSrc,
1598
                              const ptrdiff_t           orgStride,
1599
                              const int                 orgWidth,
1600
                              const int                 orgHeight,
1601
                              const int                 beforeScaleLeftOffset,
1602
                              const int                 beforeScaleTopOffset,
1603
                              Pel*                      scaledSrc,
1604
                              const ptrdiff_t           scaledStride,
1605
                              const int                 scaledWidth,
1606
                              const int                 scaledHeight,
1607
                              const int                 afterScaleLeftOffset,
1608
                              const int                 afterScaleTopOffset,
1609
                              const int                 bitDepth,
1610
                              const bool                horCollocatedPositionFlag,
1611
                              const bool                verCollocatedPositionFlag )
1612
0
{
1613
0
  static constexpr bool useLumaFilter = false;
1614
1615
0
  const TFilterCoeff* filterHor = useLumaFilter ? &InterpolationFilter::m_lumaFilter[0][0] : &InterpolationFilter::m_chromaFilter[0][0];
1616
0
  const TFilterCoeff* filterVer = useLumaFilter ? &InterpolationFilter::m_lumaFilter[0][0] : &InterpolationFilter::m_chromaFilter[0][0];
1617
1618
0
  const int numFracPositions  = useLumaFilter ? 15 : 31;
1619
0
  const int numFracShift      = useLumaFilter ? 4 : 5;
1620
0
  const int posShiftX = SCALE_RATIO_BITS - numFracShift + compScale.first;
1621
0
  const int posShiftY = SCALE_RATIO_BITS - numFracShift + compScale.second;
1622
0
  int addX = (1 << (posShiftX - 1)) + (beforeScaleLeftOffset << SCALE_RATIO_BITS) + ((int( 1 - horCollocatedPositionFlag ) * 8 * (scalingRatio.first - SCALE_1X.first) + (1 << (2 + compScale.first))) >> (3 + compScale.first));
1623
0
  int addY = (1 << (posShiftY - 1)) + (beforeScaleTopOffset << SCALE_RATIO_BITS) + ((int( 1 - verCollocatedPositionFlag ) * 8 * (scalingRatio.second - SCALE_1X.second) + (1 << (2 + compScale.second))) >> (3 + compScale.second));
1624
1625
0
  const int filterLength = useLumaFilter ? NTAPS_LUMA : NTAPS_CHROMA;
1626
0
  const int log2Norm = 12;
1627
1628
0
  CHECK( bitDepth > 17, "Overflow may happen!" );
1629
0
  const int maxVal = (1 << bitDepth) - 1;
1630
1631
0
  const int tmpStride = ( ( scaledWidth + 3 ) / 4 ) * 4;
1632
0
  const int tmpHeight = ( ( orgHeight + 3 ) / 4 ) * 4;
1633
0
  int*      tmpBuf    = new int[tmpStride * tmpHeight];
1634
1635
0
  for( int j = 0; j < orgHeight; j += 4 )
1636
0
  {
1637
0
    const Pel* org0 = orgSrc +                                j * orgStride;
1638
0
    const Pel* org1 = orgSrc + std::min( j + 1, orgHeight - 1 ) * orgStride;
1639
0
    const Pel* org2 = orgSrc + std::min( j + 2, orgHeight - 1 ) * orgStride;
1640
0
    const Pel* org3 = orgSrc + std::min( j + 3, orgHeight - 1 ) * orgStride;
1641
1642
0
    _mm_prefetch( ( const char* ) (org0 + (orgStride << 1)), _MM_HINT_T0 );
1643
0
    _mm_prefetch( ( const char* ) (org1 + (orgStride << 1)), _MM_HINT_T0 );
1644
0
    _mm_prefetch( ( const char* ) (org2 + (orgStride << 1)), _MM_HINT_T0 );
1645
0
    _mm_prefetch( ( const char* ) (org3 + (orgStride << 1)), _MM_HINT_T0 );
1646
1647
0
    for( int i = 0; i < scaledWidth; i++ )
1648
0
    {
1649
0
      int refPos = (((i << compScale.first) - afterScaleLeftOffset) * scalingRatio.first + addX) >> posShiftX;
1650
0
      int integer = refPos >> numFracShift;
1651
0
      int frac = refPos & numFracPositions;
1652
1653
0
      const TFilterCoeff* f = filterHor + frac * filterLength;
1654
1655
0
      __m128i vsrc0, vsrc1, vsrc2, vsrc3;
1656
1657
0
      if( integer + 0 - (filterLength / 2) + 1 >= 0 && integer + (NTAPS_CHROMA - 1) - (filterLength / 2) + 1 < orgWidth )
1658
0
      {
1659
0
        int xInt = integer + 0 - (filterLength / 2) + 1;
1660
1661
0
        vsrc0 = _mm_loadu_si64( (const __m128i*) & org0[xInt] );
1662
0
        vsrc1 = _mm_loadu_si64( (const __m128i*) & org1[xInt] );
1663
0
        vsrc2 = _mm_loadu_si64( (const __m128i*) & org2[xInt] );
1664
0
        vsrc3 = _mm_loadu_si64( (const __m128i*) & org3[xInt] );
1665
0
      }
1666
0
      else
1667
0
      {
1668
0
        Pel src[4][NTAPS_CHROMA];
1669
1670
0
        for( int k = 0; k < filterLength; k++ )
1671
0
        {
1672
0
          int xInt = std::min<int>( std::max( 0, integer + k - filterLength / 2 + 1 ), orgWidth - 1 );
1673
1674
0
          src[0][k] = org0[xInt];
1675
0
          src[1][k] = org1[xInt];
1676
0
          src[2][k] = org2[xInt];
1677
0
          src[3][k] = org3[xInt];
1678
0
        }
1679
1680
0
        vsrc0 = _mm_loadu_si64( (const __m128i*) & src[0][0] );
1681
0
        vsrc1 = _mm_loadu_si64( (const __m128i*) & src[1][0] );
1682
0
        vsrc2 = _mm_loadu_si64( (const __m128i*) & src[2][0] );
1683
0
        vsrc3 = _mm_loadu_si64( (const __m128i*) & src[3][0] );
1684
0
      }
1685
1686
0
      __m128i vflt = _mm_loadu_si128( (const __m128i*) f );
1687
0
      vflt = _mm_unpacklo_epi64( vflt, vflt );
1688
1689
0
      __m128i vres0 = _mm_madd_epi16( _mm_unpacklo_epi64( vsrc0, vsrc1 ), vflt );
1690
0
      __m128i vres2 = _mm_madd_epi16( _mm_unpacklo_epi64( vsrc2, vsrc3 ), vflt );
1691
1692
0
      vres0 = _mm_hadd_epi32( vres0, vres2 );
1693
1694
0
      int* tmp = tmpBuf + i * tmpHeight + j;
1695
1696
0
      _mm_storeu_si128( (__m128i*) tmp, vres0 );
1697
0
    }
1698
0
  }
1699
1700
0
  __m128i vzero = _mm_setzero_si128();
1701
0
  __m128i vnorm = _mm_set1_epi32( (1 << (log2Norm - 1)) );
1702
1703
0
  for( int i = 0; i < scaledWidth; i += 4 )
1704
0
  {
1705
0
    Pel* dst = scaledSrc;
1706
1707
0
    int* tmp0 =                       tmpBuf + i       * tmpHeight;
1708
0
    int* tmp1 = i + 1 < scaledWidth ? tmpBuf + (i + 1) * tmpHeight : tmp0;
1709
0
    int* tmp2 = i + 2 < scaledWidth ? tmpBuf + (i + 2) * tmpHeight : tmp0;
1710
0
    int* tmp3 = i + 3 < scaledWidth ? tmpBuf + (i + 3) * tmpHeight : tmp0;
1711
1712
0
    _mm_prefetch( ( const char* ) (tmp0 + (tmpHeight << 2)), _MM_HINT_T0 );
1713
0
    _mm_prefetch( ( const char* ) (tmp1 + (tmpHeight << 2)), _MM_HINT_T0 );
1714
0
    _mm_prefetch( ( const char* ) (tmp2 + (tmpHeight << 2)), _MM_HINT_T0 );
1715
0
    _mm_prefetch( ( const char* ) (tmp3 + (tmpHeight << 2)), _MM_HINT_T0 );
1716
1717
0
    for( int j = 0; j < scaledHeight; j++ )
1718
0
    {
1719
0
      const int refPos = (((j << compScale.second) - afterScaleTopOffset) * scalingRatio.second + addY) >> posShiftY;
1720
0
      const int integer = refPos >> numFracShift;
1721
0
      const int frac = refPos & numFracPositions;
1722
0
      const TFilterCoeff* f = filterVer + frac * filterLength;
1723
1724
0
      __m128i vflt;
1725
0
      vflt = _mm_cvtepi16_epi32( _mm_loadu_si128( (const __m128i*)  f ) );
1726
1727
0
      __m128i vsrc0, vsrc1, vsrc2, vsrc3;
1728
1729
0
      if( integer + 0 - (filterLength / 2) + 1 >= 0 && integer + (NTAPS_CHROMA - 1) - (filterLength / 2) + 1 < orgHeight )
1730
0
      {
1731
0
        int yInt = integer + 0 - (filterLength / 2) + 1;
1732
1733
0
        vsrc0 = _mm_loadu_si128( (const __m128i*) &tmp0[yInt] );
1734
0
        vsrc1 = _mm_loadu_si128( (const __m128i*) &tmp1[yInt] );
1735
0
        vsrc2 = _mm_loadu_si128( (const __m128i*) &tmp2[yInt] );
1736
0
        vsrc3 = _mm_loadu_si128( (const __m128i*) &tmp3[yInt] );
1737
0
      }
1738
0
      else
1739
0
      {
1740
0
        int src[4][NTAPS_CHROMA];
1741
1742
0
        for( int k = 0; k < filterLength; k++ )
1743
0
        {
1744
0
          int yInt = std::min<int>( std::max( 0, integer + k - filterLength / 2 + 1 ), orgHeight - 1 );
1745
0
          src[0][k] = tmp0[yInt];
1746
0
          src[1][k] = tmp1[yInt];
1747
0
          src[2][k] = tmp2[yInt];
1748
0
          src[3][k] = tmp3[yInt];
1749
0
        }
1750
1751
0
        vsrc0 = _mm_loadu_si128( (const __m128i*) & src[0][0] );
1752
0
        vsrc1 = _mm_loadu_si128( (const __m128i*) & src[1][0] );
1753
0
        vsrc2 = _mm_loadu_si128( (const __m128i*) & src[2][0] );
1754
0
        vsrc3 = _mm_loadu_si128( (const __m128i*) & src[3][0] );
1755
0
      }
1756
1757
0
      __m128i vres0 = _mm_mullo_epi32( vsrc0, vflt );
1758
0
      __m128i vres1 = _mm_mullo_epi32( vsrc1, vflt );
1759
0
      __m128i vres2 = _mm_mullo_epi32( vsrc2, vflt );
1760
0
      __m128i vres3 = _mm_mullo_epi32( vsrc3, vflt );
1761
1762
0
      vres0 = _mm_hadd_epi32( vres0, vres1 );
1763
0
      vres2 = _mm_hadd_epi32( vres2, vres3 );
1764
1765
0
      vres0 = _mm_hadd_epi32( vres0, vres2 );
1766
1767
0
      vres0 = _mm_add_epi32( vres0, vnorm );
1768
0
      vres0 = _mm_srai_epi32( vres0, log2Norm );
1769
0
      vres0 = _mm_max_epi32( _mm_min_epi32( _mm_set1_epi32( maxVal ), vres0 ), vzero );
1770
1771
0
      vres0 = _mm_packs_epi32( vres0, _mm_setzero_si128() );
1772
1773
0
      if( i + 3 < scaledWidth )
1774
0
      {
1775
0
        _mm_storeu_si64( (__m128i*) & dst[i], vres0 );
1776
0
      }
1777
0
      else if( i + 2 < scaledWidth )
1778
0
      {
1779
0
        _mm_storeu_si32( (__m128i*) &dst[i], vres0 );
1780
0
        dst[i + 2] = _mm_extract_epi16( vres0, 2 );
1781
0
      }
1782
0
      else if( i + 1 < scaledWidth )
1783
0
      {
1784
0
        _mm_storeu_si32( (__m128i*) &dst[i], vres0 );
1785
0
      }
1786
0
      else
1787
0
      {
1788
0
        dst[i] = _mm_extract_epi16( vres0, 0 );
1789
0
      }
1790
1791
0
      dst += scaledStride;
1792
0
    }
1793
0
  }
1794
1795
0
  delete[] tmpBuf;
1796
0
}
Unexecuted instantiation: void vvdec::sampleRateConvSIMD_4tap<(vvdec::x86_simd::X86_VEXT)1>(std::__1::pair<int, int>, std::__1::pair<int, int>, short const*, long, int, int, int, int, short*, long, int, int, int, int, int, bool, bool)
Unexecuted instantiation: void vvdec::sampleRateConvSIMD_4tap<(vvdec::x86_simd::X86_VEXT)4>(std::__1::pair<int, int>, std::__1::pair<int, int>, short const*, long, int, int, int, int, short*, long, int, int, int, int, int, bool, bool)
1797
1798
template<X86_VEXT vext>
1799
void sampleRateConvSIMD( const std::pair<int, int> scalingRatio,
1800
                         const std::pair<int, int> compScale,
1801
                         const Pel*                orgSrc,
1802
                         const ptrdiff_t           orgStride,
1803
                         const int                 orgWidth,
1804
                         const int                 orgHeight,
1805
                         const int                 beforeScaleLeftOffset,
1806
                         const int                 beforeScaleTopOffset,
1807
                         Pel*                      scaledSrc,
1808
                         const ptrdiff_t           scaledStride,
1809
                         const int                 scaledWidth,
1810
                         const int                 scaledHeight,
1811
                         const int                 afterScaleLeftOffset,
1812
                         const int                 afterScaleTopOffset,
1813
                         const int                 bitDepth,
1814
                         const bool                useLumaFilter,
1815
                         const bool                horCollocatedPositionFlag,
1816
                         const bool                verCollocatedPositionFlag )
1817
0
{
1818
0
  if( orgWidth == scaledWidth && orgHeight == scaledHeight && scalingRatio == SCALE_1X && !beforeScaleLeftOffset && !beforeScaleTopOffset && !afterScaleLeftOffset && !afterScaleTopOffset )
1819
0
  {
1820
0
    g_pelBufOP.copyBuffer( ( const char* ) orgSrc, orgStride * sizeof( Pel ), ( char* ) scaledSrc, scaledStride * sizeof( Pel ), orgWidth * sizeof( Pel ), orgHeight );
1821
1822
0
    return;
1823
0
  }
1824
0
  else if( useLumaFilter )
1825
0
  {
1826
0
    sampleRateConvSIMD_8tap<vext>( scalingRatio, compScale, orgSrc, orgStride, orgWidth, orgHeight, beforeScaleLeftOffset, beforeScaleTopOffset, scaledSrc, scaledStride, scaledWidth, scaledHeight, afterScaleLeftOffset, afterScaleTopOffset, bitDepth );
1827
0
  }
1828
0
  else
1829
0
  {
1830
0
    sampleRateConvSIMD_4tap<vext>( scalingRatio, compScale, orgSrc, orgStride, orgWidth, orgHeight, beforeScaleLeftOffset, beforeScaleTopOffset, scaledSrc, scaledStride, scaledWidth, scaledHeight, afterScaleLeftOffset, afterScaleTopOffset, bitDepth, horCollocatedPositionFlag, verCollocatedPositionFlag );
1831
0
  }
1832
0
}
Unexecuted instantiation: void vvdec::sampleRateConvSIMD<(vvdec::x86_simd::X86_VEXT)1>(std::__1::pair<int, int>, std::__1::pair<int, int>, short const*, long, int, int, int, int, short*, long, int, int, int, int, int, bool, bool, bool)
Unexecuted instantiation: void vvdec::sampleRateConvSIMD<(vvdec::x86_simd::X86_VEXT)4>(std::__1::pair<int, int>, std::__1::pair<int, int>, short const*, long, int, int, int, int, short*, long, int, int, int, int, int, bool, bool, bool)
1833
1834
template<X86_VEXT vext>
// Install the x86 SIMD implementations of the pixel-buffer operations into
// this PelBufferOps instance for the given instruction-set extension 'vext'.
void PelBufferOps::_initPelBufOpsX86()
{
  addAvg16 = addAvg_SSE<vext, 16>;
  addAvg8  = addAvg_SSE<vext,  8>;
  addAvg4  = addAvg_SSE<vext,  4>;

  reco8 = reco_SSE<vext, 8>;
  reco4 = reco_SSE<vext, 4>;

  linTf8 = linTf_SSE_entry<vext, 8>;
  linTf4 = linTf_SSE_entry<vext, 4>;
#if ENABLE_SIMD_OPT_GBI

  wghtAvg4 = addWghtAvg_SSE<vext, 4>;
  wghtAvg8 = addWghtAvg_SSE<vext, 8>;
#endif

  copyBuffer = copyBuffer_SSE<vext>;

  transpose4x4 = transposePel_SSE<vext, 4>;
  transpose8x8 = transposePel_SSE<vext, 8>;

#if defined( REAL_TARGET_X86 ) // looks like those function only really work for x86 SIMD
  // NOTE(review): applyLut is installed only for AVX2 and rspBcw only below
  // AVX2 — presumably the AVX2 applyLut covers the rspBcw use case while the
  // SSE fallback keeps the scalar applyLut; confirm against the callers.
  if( vext >= AVX2 )
    applyLut = applyLut_SIMD<vext>;
  else
    rspBcw = rspBcwCore_SIMD<vext>;

#endif
  rspFwd = rspFwdCore_SIMD<vext>;

  // fillN_CU_SIMD exists only for 32- and 64-bit pointer widths
#if INTPTR_MAX == INT64_MAX || INTPTR_MAX == INT32_MAX
  fillN_CU = fillN_CU_SIMD<vext>;
#endif

  sampleRateConv = sampleRateConvSIMD<vext>;
}
Unexecuted instantiation: void vvdec::PelBufferOps::_initPelBufOpsX86<(vvdec::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvdec::PelBufferOps::_initPelBufOpsX86<(vvdec::x86_simd::X86_VEXT)4>()
1872
1873
template void PelBufferOps::_initPelBufOpsX86<SIMDX86>();
1874
1875
}
1876
1877
#endif // TARGET_SIMD_X86
1878
#endif // ENABLE_SIMD_OPT_BUFFER
1879
//! \}