Coverage Report

Created: 2026-04-01 07:49

/src/vvdec/source/Lib/CommonLib/x86/InterPredX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     InterPredX86.h
44
    \brief    SIMD for InterPrediction
45
*/
46
47
//! \ingroup CommonLib
48
//! \{
49
50
51
#include "CommonLib/CommonDef.h"
52
#include "CommonDefX86.h"
53
#include "CommonLib/InterPrediction.h"
54
55
namespace vvdec
56
{
57
58
#if ENABLE_SIMD_OPT_INTER
59
#ifdef TARGET_SIMD_X86
60
61
template<X86_VEXT vext>
62
inline void PaddBIO_SIMD( const Pel* refPel, Pel* dstPel, unsigned width, const int shift )
63
0
{
64
0
  int w;
65
0
  __m128i off = _mm_set1_epi16( ( Pel ) IF_INTERNAL_OFFS );
66
0
  __m128i vshift = _mm_cvtsi32_si128( shift );
67
68
0
  if( width > 4 )
69
0
  {
70
0
    for( w = 0; w < width; w += 8 )
71
0
    {
72
73
0
      __m128i ref = _mm_loadu_si128( ( __m128i const * )&refPel[w] );
74
0
      ref = _mm_sll_epi16( ref, vshift );
75
0
      ref = _mm_sub_epi16( ref, off );
76
0
      _mm_storeu_si128( ( __m128i * )&dstPel[w], ref );
77
78
0
    }
79
    // convert the 2 * BIO_EXTEND_SIZE trailing padding samples as well
80
0
    __m128i ref = _mm_loadu_si128( ( __m128i const * )&refPel[w] );
81
0
    ref = _mm_sll_epi16( ref, vshift );
82
0
    ref = _mm_sub_epi16( ref, off );
83
0
    _mm_storeu_si32( ( __m128i * )&dstPel[w], ref );
84
85
0
  }
86
0
  else
87
0
  {
88
0
    __m128i ref = _mm_loadu_si128( ( __m128i const * )&refPel[0] );
89
0
    ref = _mm_sll_epi16( ref, vshift );
90
0
    ref = _mm_sub_epi16( ref, off );
91
0
    _mm_storeu_si64( ( __m128i * )&dstPel[0], ref );
92
0
    ref = _mm_srli_si128( ref, 8 );
93
0
    _mm_storeu_si32( ( __m128i * )&dstPel[4], ref );
94
0
  }
95
0
}
Unexecuted instantiation: void vvdec::PaddBIO_SIMD<(vvdec::x86_simd::X86_VEXT)1>(short const*, short*, unsigned int, int)
Unexecuted instantiation: void vvdec::PaddBIO_SIMD<(vvdec::x86_simd::X86_VEXT)4>(short const*, short*, unsigned int, int)
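For reference, the vectorized padding above reduces to the following scalar loop (a sketch assuming IF_INTERNAL_OFFS and BIO_EXTEND_SIZE from CommonDef.h; the two extra samples stored after the main loop imply BIO_EXTEND_SIZE == 1, so like the SIMD path this also converts the 2 * BIO_EXTEND_SIZE trailing padding samples):

static inline void PaddBIO_scalar( const Pel* refPel, Pel* dstPel, unsigned width, const int shift )
{
  // lift each reference sample to internal precision, then remove the internal offset
  for( unsigned w = 0; w < width + 2 * BIO_EXTEND_SIZE; w++ )
  {
    dstPel[w] = ( Pel ) ( ( refPel[w] << shift ) - IF_INTERNAL_OFFS );
  }
}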
96
97
static inline int rightShiftMSB( int numer, int denom )
98
0
{
99
0
  unsigned int shiftIdx = bit_scan_reverse( denom );
100
0
  return ( numer >> shiftIdx );
101
0
}
Unexecuted instantiation: InterPred_sse41.cpp:vvdec::rightShiftMSB(int, int)
Unexecuted instantiation: InterPred_avx2.cpp:vvdec::rightShiftMSB(int, int)
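rightShiftMSB approximates numer / denom by shifting with the position of the most significant set bit of denom, i.e. it divides by the largest power of two not exceeding the denominator: bit_scan_reverse( 6 ) == 2, so rightShiftMSB( 25, 6 ) yields 25 >> 2 == 6 rather than the exact quotient 4. BDOF tolerates this coarse division because the resulting refinements are clipped to [-limit, limit] immediately afterwards.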
102
103
template<X86_VEXT vext>
104
inline void addBIOAvg4_2x_SSE(const int16_t* src0,   const int16_t* src1,
105
                                           int16_t* dst,    ptrdiff_t dstStride,
106
                                     const int16_t* gradX0, const int16_t* gradX1,
107
                                     const int16_t* gradY0, const int16_t* gradY1,
108
                                          ptrdiff_t widthG,
109
                                           int tmpx0, int tmpy0, int tmpx1, int tmpy1,
110
                                           int shift, int offset, const ClpRng& clpRng)
111
0
{
112
0
  const ptrdiff_t src0Stride = widthG;
113
0
  const ptrdiff_t src1Stride = widthG;
114
0
  const ptrdiff_t gradStride = widthG;
115
  
116
0
  __m128i mm_tmpx0  = _mm_set1_epi32( ( tmpx0 & 0xffff ) | ( tmpy0 *(1<< 16 )) );
117
0
  __m128i mm_tmpx1  = _mm_set1_epi32( ( tmpx1 & 0xffff ) | ( tmpy1 *(1<< 16 )) );
118
0
  __m128i mm_offset = _mm_set1_epi32( offset );
119
0
  __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
120
0
  __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
121
0
  __m128i mm_a;
122
0
  __m128i mm_b;
123
0
  __m128i mm_sum;
124
125
0
  __m128i mm_gx0, mm_gx1, mm_gy0, mm_gy1, mm_s0, mm_s1, mm_tmp;
126
127
0
  for( int y = 0; y < 4; y++, dst += dstStride, src0 += src0Stride, src1 += src1Stride, gradX0 += gradStride, gradX1 += gradStride, gradY0 += gradStride, gradY1 += gradStride )
128
0
  {
129
0
    mm_gx0 = _mm_loadu_si128    ( ( const __m128i* ) gradX0 );
130
0
    mm_gx1 = _mm_loadu_si128    ( ( const __m128i* ) gradX1 );
131
0
    mm_gy0 = _mm_loadu_si128    ( ( const __m128i* ) gradY0 );
132
0
    mm_gy1 = _mm_loadu_si128    ( ( const __m128i* ) gradY1 );
133
0
    mm_s0  = _mm_loadu_si128    ( ( const __m128i* ) src0 );
134
0
    mm_s1  = _mm_loadu_si128    ( ( const __m128i* ) src1 );
135
136
0
    mm_a   = _mm_unpacklo_epi16 ( mm_gx0, mm_gy0 );
137
0
    mm_b   = _mm_unpacklo_epi16 ( mm_gx1, mm_gy1 );
138
0
    mm_a   = _mm_sub_epi16      ( mm_a, mm_b );
139
0
    mm_sum = _mm_madd_epi16     ( mm_a, mm_tmpx0 );
140
0
    mm_a   = _mm_cvtepi16_epi32 ( mm_s0 );
141
0
    mm_b   = _mm_cvtepi16_epi32 ( mm_s1 );
142
0
    mm_tmp = _mm_add_epi32      ( _mm_add_epi32( mm_sum, mm_a ), _mm_add_epi32( mm_b, mm_offset ) );
143
    
144
0
    mm_a   = _mm_unpackhi_epi16 ( mm_gx0, mm_gy0 );
145
0
    mm_b   = _mm_unpackhi_epi16 ( mm_gx1, mm_gy1 );
146
0
    mm_a   = _mm_sub_epi16      ( mm_a, mm_b );
147
0
    mm_sum = _mm_madd_epi16     ( mm_a, mm_tmpx1 );
148
0
    mm_a   = _mm_cvtepi16_epi32 ( _mm_unpackhi_epi64( mm_s0, mm_s0 ) );
149
0
    mm_b   = _mm_cvtepi16_epi32 ( _mm_unpackhi_epi64( mm_s1, mm_s0 ) );
150
0
    mm_sum = _mm_add_epi32      ( _mm_add_epi32( mm_sum, mm_a ), _mm_add_epi32( mm_b, mm_offset ) );
151
152
0
    mm_sum = _mm_packs_epi32    ( _mm_srai_epi32( mm_tmp, shift ), _mm_srai_epi32( mm_sum, shift ) );
153
0
    mm_sum = _mm_min_epi16      ( vibdimax, _mm_max_epi16( vibdimin, mm_sum ) );
154
155
0
    _mm_storeu_si128            ( (__m128i *) dst, mm_sum );
156
0
  }
157
0
}
158
159
#if USE_AVX2
160
template<>
161
inline void addBIOAvg4_2x_SSE<AVX2>(const int16_t* src0, const int16_t* src1, int16_t* dst, ptrdiff_t dstStride, const int16_t* gradX0, const int16_t* gradX1, const int16_t* gradY0, const int16_t* gradY1, ptrdiff_t widthG, int tmpx0, int tmpx1, int tmpy0, int tmpy1, int shift, int offset, const ClpRng& clpRng)
162
0
{
163
0
  const ptrdiff_t src0Stride = widthG;
164
0
  const ptrdiff_t src1Stride = widthG;
165
0
  const ptrdiff_t gradStride = widthG;
166
167
0
  __m256i mm_tmpx    = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_set1_epi32( ( tmpx0 & 0xffff ) | ( tmpy0 *(1<< 16 )) ) ), _mm_set1_epi32( ( tmpx1 & 0xffff ) | ( tmpy1 *(1<< 16 )) ), 1 );
168
0
  __m256i mm_offset  = _mm256_set1_epi32( offset );
169
0
  __m256i vibdimin   = _mm256_set1_epi32( clpRng.min() );
170
0
  __m256i vibdimax   = _mm256_set1_epi32( clpRng.max() );
171
0
  __m256i mm_a;
172
0
  __m256i mm_b;
173
0
  __m256i mm_sum;
174
0
  __m128i xsrc0, xsrc1;
175
176
0
  for( int y = 0; y < 4; y++, dst += dstStride, src0 += src0Stride, src1 += src1Stride, gradX0 += gradStride, gradX1 += gradStride, gradY0 += gradStride, gradY1 += gradStride )
177
0
  {
178
0
    xsrc0  = _mm_loadu_si128       ( ( const __m128i * ) gradX0 );
179
0
    xsrc1  = _mm_loadu_si128       ( ( const __m128i * ) gradY0 );
180
0
    mm_a   = _mm256_castsi128_si256( _mm_unpacklo_epi16( xsrc0, xsrc1 ) );
181
0
    mm_a   = _mm256_inserti128_si256( mm_a, _mm_unpackhi_epi16( xsrc0, xsrc1 ), 1 );
182
0
    xsrc0  = _mm_loadu_si128       ( ( const __m128i * ) gradX1 );
183
0
    xsrc1  = _mm_loadu_si128       ( ( const __m128i * ) gradY1 );
184
0
    mm_b   = _mm256_castsi128_si256( _mm_unpacklo_epi16( xsrc0, xsrc1 ) );
185
0
    mm_b   = _mm256_inserti128_si256( mm_b, _mm_unpackhi_epi16( xsrc0, xsrc1 ), 1 );
186
0
    mm_a   = _mm256_sub_epi16      ( mm_a, mm_b );
187
0
    mm_sum = _mm256_madd_epi16     ( mm_a, mm_tmpx );
188
0
    mm_a   = _mm256_cvtepi16_epi32 ( _mm_loadu_si128( (const __m128i *) ( src0 ) ) );
189
0
    mm_b   = _mm256_cvtepi16_epi32 ( _mm_loadu_si128( (const __m128i *) ( src1 ) ) );
190
0
    mm_sum = _mm256_add_epi32      ( _mm256_add_epi32( mm_sum, mm_a ), _mm256_add_epi32( mm_b, mm_offset ) );
191
0
    mm_sum = _mm256_srai_epi32     ( mm_sum, shift );
192
0
    mm_sum = _mm256_min_epi32      ( vibdimax, _mm256_max_epi32( vibdimin, mm_sum ) );
193
0
    _mm_storeu_si128               ( (__m128i *) dst, _mm256_cvtepi32_epi16x( mm_sum ) );
194
0
  }
195
0
}
196
#endif
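Both the generic template and the AVX2 specialization apply the same per-pixel BDOF correction; the `tmpy0 *(1<< 16 )` packing places tmpy in the high 16 bits of each 32-bit madd operand (written as a multiplication rather than a shift, since left-shifting a negative value is undefined behavior). A scalar sketch of the step, with the left four columns using ( tmpx0, tmpy0 ) and the right four ( tmpx1, tmpy1 ), and Clip3 as used elsewhere in this file:

static inline void addBIOAvg4_2x_scalar( const int16_t* src0, const int16_t* src1, int16_t* dst, ptrdiff_t dstStride,
                                         const int16_t* gradX0, const int16_t* gradX1, const int16_t* gradY0, const int16_t* gradY1,
                                         ptrdiff_t widthG, int tmpx0, int tmpy0, int tmpx1, int tmpy1,
                                         int shift, int offset, const ClpRng& clpRng )
{
  for( int y = 0; y < 4; y++, dst += dstStride, src0 += widthG, src1 += widthG, gradX0 += widthG, gradX1 += widthG, gradY0 += widthG, gradY1 += widthG )
  {
    for( int x = 0; x < 8; x++ )
    {
      const int tmpx = x < 4 ? tmpx0 : tmpx1;
      const int tmpy = x < 4 ? tmpy0 : tmpy1;
      // gradient-based correction added on top of the bi-prediction average
      const int b = tmpx * ( gradX0[x] - gradX1[x] ) + tmpy * ( gradY0[x] - gradY1[x] );
      dst[x] = ( int16_t ) Clip3<int>( clpRng.min(), clpRng.max(), ( src0[x] + src1[x] + b + offset ) >> shift );
    }
  }
}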
197
198
template< X86_VEXT vext >
199
static inline void calcBIOSums_SSE(const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, const int widthG, const int bitDepth, int limit, int &tmpx, int &tmpy)
200
0
{
201
0
  static constexpr int shift4 = 4;
202
0
  static constexpr int shift5 = 1;
203
0
  const int srcStride = widthG;
204
205
0
  __m128i sumAbsGXTmp    = _mm_setzero_si128();
206
0
  __m128i sumDIXTmp      = _mm_setzero_si128();
207
0
  __m128i sumAbsGYTmp    = _mm_setzero_si128();
208
0
  __m128i sumDIYTmp      = _mm_setzero_si128();
209
0
  __m128i sumSignGyGxTmp = _mm_setzero_si128();
210
211
0
  for (int y = 0; y < 6; y++)
212
0
  {
213
0
    __m128i shiftSrcY0Tmp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY0Tmp)), shift4);
214
0
    __m128i shiftSrcY1Tmp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY1Tmp)), shift4);
215
0
    __m128i loadGradX0 = _mm_loadu_si128((__m128i*)(gradX0));
216
0
    __m128i loadGradX1 = _mm_loadu_si128((__m128i*)(gradX1));
217
0
    __m128i loadGradY0 = _mm_loadu_si128((__m128i*)(gradY0));
218
0
    __m128i loadGradY1 = _mm_loadu_si128((__m128i*)(gradY1));
219
0
    __m128i subTemp1  = _mm_sub_epi16(shiftSrcY1Tmp, shiftSrcY0Tmp);
220
0
    __m128i packTempX = _mm_srai_epi16(_mm_add_epi16(loadGradX0, loadGradX1), shift5);
221
0
    __m128i packTempY = _mm_srai_epi16(_mm_add_epi16(loadGradY0, loadGradY1), shift5);
222
0
    __m128i gX = _mm_abs_epi16(packTempX);
223
0
    __m128i gY = _mm_abs_epi16(packTempY);
224
0
    __m128i dIX       = _mm_sign_epi16(subTemp1,  packTempX );
225
0
    __m128i dIY       = _mm_sign_epi16(subTemp1,  packTempY );
226
0
    __m128i signGY_GX = _mm_sign_epi16(packTempX, packTempY );
227
228
0
    sumAbsGXTmp     = _mm_add_epi16(sumAbsGXTmp, gX);
229
0
    sumDIXTmp       = _mm_add_epi16(sumDIXTmp, dIX);
230
0
    sumAbsGYTmp     = _mm_add_epi16(sumAbsGYTmp, gY);
231
0
    sumDIYTmp       = _mm_add_epi16(sumDIYTmp, dIY);
232
0
    sumSignGyGxTmp  = _mm_add_epi16(sumSignGyGxTmp, signGY_GX);
233
0
    srcY0Tmp += srcStride;
234
0
    srcY1Tmp += srcStride;
235
0
    gradX0 += widthG;
236
0
    gradX1 += widthG;
237
0
    gradY0 += widthG;
238
0
    gradY1 += widthG;
239
0
  }
240
241
0
  sumAbsGXTmp    = _mm_madd_epi16(sumAbsGXTmp,    _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
242
0
  sumDIXTmp      = _mm_madd_epi16(sumDIXTmp,      _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
243
0
  sumAbsGYTmp    = _mm_madd_epi16(sumAbsGYTmp,    _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
244
0
  sumDIYTmp      = _mm_madd_epi16(sumDIYTmp,      _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
245
0
  sumSignGyGxTmp = _mm_madd_epi16(sumSignGyGxTmp, _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
246
247
0
  __m128i a12 = _mm_unpacklo_epi32(sumAbsGXTmp, sumAbsGYTmp);
248
0
  __m128i a3  = _mm_unpackhi_epi32(sumAbsGXTmp, sumAbsGYTmp);
249
0
  __m128i b12 = _mm_unpacklo_epi32(sumDIXTmp, sumDIYTmp);
250
0
  __m128i b3  = _mm_unpackhi_epi32(sumDIXTmp, sumDIYTmp);
251
0
  __m128i c1  = _mm_unpacklo_epi64(a12, b12);
252
0
  __m128i c2  = _mm_unpackhi_epi64(a12, b12);
253
0
  __m128i c3  = _mm_unpacklo_epi64(a3, b3);
254
255
0
  c1 = _mm_add_epi32(c1, c2);
256
0
  c1 = _mm_add_epi32(c1, c3);
257
258
0
  int sumAbsGX = _mm_cvtsi128_si32(c1);
259
0
  int sumAbsGY = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0x55));
260
0
  int sumDIX   = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0xaa));
261
0
  int sumDIY   = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0xff));
262
263
0
  sumSignGyGxTmp = _mm_add_epi32(sumSignGyGxTmp, _mm_shuffle_epi32(sumSignGyGxTmp, 0x4e));   // 01001110
264
0
  sumSignGyGxTmp = _mm_add_epi32(sumSignGyGxTmp, _mm_shuffle_epi32(sumSignGyGxTmp, 0xb1));   // 10110001
265
0
  int sumSignGY_GX  = _mm_cvtsi128_si32(sumSignGyGxTmp);
266
267
0
  tmpx = sumAbsGX == 0 ? 0 : rightShiftMSB( sumDIX *4, sumAbsGX );
268
0
  tmpx = Clip3( -limit, limit, tmpx );
269
270
0
  int mainsGxGy = sumSignGY_GX >> 12;
271
0
  int secsGxGy  = sumSignGY_GX & ( ( 1 << 12 ) - 1 );
272
0
  int tmpData   = tmpx * mainsGxGy;
273
0
  tmpData       = ( ( tmpData *(1<< 12 )) + tmpx * secsGxGy ) >> 1;
274
0
  tmpy = sumAbsGY == 0 ? 0 : rightShiftMSB( ( ( sumDIY *(1<< 2 )) - tmpData ), sumAbsGY );
275
0
  tmpy = Clip3( -limit, limit, tmpy );
276
0
}
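In scalar terms, calcBIOSums accumulates five correlation sums over a 6x6 window; the trailing _mm_madd_epi16 against ( 1, 1, 1, 1, 1, 1, 0, 0 ) masks out the two unused lanes per row while folding the 16-bit accumulators to 32 bits. A sketch, where sign( a, b ) mirrors _mm_sign_epi16 ( a for b > 0, -a for b < 0, 0 for b == 0 ) and shift4 == 4, shift5 == 1:

int sumAbsGX = 0, sumAbsGY = 0, sumDIX = 0, sumDIY = 0, sumSignGY_GX = 0;
for( int y = 0; y < 6; y++ )
{
  for( int x = 0; x < 6; x++ )
  {
    const int diff = ( srcY1Tmp[x] >> 4 ) - ( srcY0Tmp[x] >> 4 );
    const int gx   = ( gradX0[x] + gradX1[x] ) >> 1;
    const int gy   = ( gradY0[x] + gradY1[x] ) >> 1;
    sumAbsGX     += std::abs( gx );
    sumAbsGY     += std::abs( gy );
    sumDIX       += gx == 0 ? 0 : ( gx > 0 ? diff : -diff );   // sign( diff, gx )
    sumDIY       += gy == 0 ? 0 : ( gy > 0 ? diff : -diff );   // sign( diff, gy )
    sumSignGY_GX += gy == 0 ? 0 : ( gy > 0 ? gx   : -gx   );   // sign( gx, gy )
  }
  srcY0Tmp += srcStride;  srcY1Tmp += srcStride;
  gradX0 += widthG;  gradX1 += widthG;  gradY0 += widthG;  gradY1 += widthG;
}

The 12-bit split of sumSignGY_GX that follows evaluates ( tmpx * sumSignGY_GX ) >> 1 as two small partial products instead of one potentially overflowing 32-bit multiplication.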
277
278
#if USE_AVX2
279
static inline void calcBIOSums2x_AVX2(const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, const int widthG, const int bitDepth, int limit, int &tmpx0, int &tmpx1, int &tmpy0, int &tmpy1 )
280
0
{
281
0
  static constexpr int shift4 = 4;
282
0
  static constexpr int shift5 = 1;
283
0
  const int srcStride = widthG;
284
  
285
0
  __m256i sumAbsGXTmp     = _mm256_setzero_si256();
286
0
  __m256i sumDIXTmp       = _mm256_setzero_si256();
287
0
  __m256i sumAbsGYTmp     = _mm256_setzero_si256();
288
0
  __m256i sumDIYTmp       = _mm256_setzero_si256();
289
0
  __m256i sumSignGyGxTmp  = _mm256_setzero_si256();
290
291
0
#define _mm256_load2_si128_offset4(addr) _mm256_inserti128_si256( _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*) &addr[0])), _mm_loadu_si128((const __m128i*) &addr[4]), 1 )
292
293
0
  for (int y = 0; y < 6; y++)
294
0
  {
295
0
    __m256i shiftSrcY0Tmp = _mm256_srai_epi16(_mm256_load2_si128_offset4(srcY0Tmp), shift4);
296
0
    __m256i shiftSrcY1Tmp = _mm256_srai_epi16(_mm256_load2_si128_offset4(srcY1Tmp), shift4);
297
0
    __m256i loadGradX0    = _mm256_load2_si128_offset4(gradX0);
298
0
    __m256i loadGradX1    = _mm256_load2_si128_offset4(gradX1);
299
0
    __m256i loadGradY0    = _mm256_load2_si128_offset4(gradY0);
300
0
    __m256i loadGradY1    = _mm256_load2_si128_offset4(gradY1);
301
0
    __m256i subTemp1      = _mm256_sub_epi16(shiftSrcY1Tmp, shiftSrcY0Tmp);
302
0
    __m256i packTempX     = _mm256_srai_epi16(_mm256_add_epi16(loadGradX0, loadGradX1), shift5);
303
0
    __m256i packTempY     = _mm256_srai_epi16(_mm256_add_epi16(loadGradY0, loadGradY1), shift5);
304
0
    __m256i gX            = _mm256_abs_epi16(packTempX);
305
0
    __m256i gY            = _mm256_abs_epi16(packTempY);
306
0
    __m256i dIX           = _mm256_sign_epi16(subTemp1,  packTempX );
307
0
    __m256i dIY           = _mm256_sign_epi16(subTemp1,  packTempY );
308
0
    __m256i signGY_GX     = _mm256_sign_epi16(packTempX, packTempY );
309
310
0
    sumAbsGXTmp     = _mm256_add_epi16(sumAbsGXTmp, gX);
311
0
    sumDIXTmp       = _mm256_add_epi16(sumDIXTmp, dIX);
312
0
    sumAbsGYTmp     = _mm256_add_epi16(sumAbsGYTmp, gY);
313
0
    sumDIYTmp       = _mm256_add_epi16(sumDIYTmp, dIY);
314
0
    sumSignGyGxTmp  = _mm256_add_epi16(sumSignGyGxTmp, signGY_GX);
315
316
0
    srcY0Tmp += srcStride;
317
0
    srcY1Tmp += srcStride;
318
0
    gradX0 += widthG;
319
0
    gradX1 += widthG;
320
0
    gradY0 += widthG;
321
0
    gradY1 += widthG;
322
0
  }
323
324
0
#undef _mm256_load2_si128_offset4
325
326
0
  sumAbsGXTmp    = _mm256_madd_epi16(sumAbsGXTmp,    _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
327
0
  sumDIXTmp      = _mm256_madd_epi16(sumDIXTmp,      _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
328
0
  sumAbsGYTmp    = _mm256_madd_epi16(sumAbsGYTmp,    _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
329
0
  sumDIYTmp      = _mm256_madd_epi16(sumDIYTmp,      _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
330
0
  sumSignGyGxTmp = _mm256_madd_epi16(sumSignGyGxTmp, _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
331
332
0
  __m256i a12 = _mm256_unpacklo_epi32(sumAbsGXTmp, sumAbsGYTmp);
333
0
  __m256i a3  = _mm256_unpackhi_epi32(sumAbsGXTmp, sumAbsGYTmp);
334
0
  __m256i b12 = _mm256_unpacklo_epi32(sumDIXTmp, sumDIYTmp);
335
0
  __m256i b3  = _mm256_unpackhi_epi32(sumDIXTmp, sumDIYTmp);
336
0
  __m256i c1  = _mm256_unpacklo_epi64(a12, b12);
337
0
  __m256i c2  = _mm256_unpackhi_epi64(a12, b12);
338
0
  __m256i c3  = _mm256_unpacklo_epi64(a3, b3);
339
340
0
  c1 = _mm256_add_epi32(c1, c2);
341
0
  c1 = _mm256_add_epi32(c1, c3);
342
343
0
  int tmpData[8];
344
345
0
  _mm256_storeu_si256( ( __m256i* ) &tmpData[0], c1 );
346
347
0
  #define sumAbsGX0 tmpData[0]
348
0
  #define sumAbsGX1 tmpData[4]
349
350
0
  #define sumAbsGY0 tmpData[1]
351
0
  #define sumAbsGY1 tmpData[5]
352
353
0
  #define sumDIX0   tmpData[2]
354
0
  #define sumDIX1   tmpData[6]
355
356
0
  #define sumDIY0   tmpData[3]
357
0
  #define sumDIY1   tmpData[7]
358
359
0
  sumSignGyGxTmp = _mm256_add_epi32(sumSignGyGxTmp, _mm256_shuffle_epi32(sumSignGyGxTmp, 0x4e));   // 01001110
360
0
  sumSignGyGxTmp = _mm256_add_epi32(sumSignGyGxTmp, _mm256_shuffle_epi32(sumSignGyGxTmp, 0xb1));   // 10110001
361
362
0
  int sumSignGY_GX0 = _mm256_extract_epi32( sumSignGyGxTmp, 0 );
363
0
  int sumSignGY_GX1 = _mm256_extract_epi32( sumSignGyGxTmp, 4 );
364
365
#if 0
366
  tmpx0 = sumAbsGX0 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( sumDIX0 << 2, sumAbsGX0 ) );
367
  tmpx1 = sumAbsGX1 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( sumDIX1 << 2, sumAbsGX1 ) );
368
  __m128i vtmpx         = _mm_setr_epi32 ( tmpx0, tmpx1, 0, 0 );
369
  __m128i vsumSignGY_GX = _mm_setr_epi32 ( sumSignGY_GX0, sumSignGY_GX1, 0, 0 );
370
  __m128i vmainsGxGy    = _mm_srai_epi32 ( vsumSignGY_GX, 12 );
371
  __m128i vsecsGxGy     = _mm_and_si128  ( vsumSignGY_GX, _mm_set1_epi32( ( 1 << 12 ) - 1 ) );
372
  __m128i vtmpData      = _mm_mullo_epi32( vtmpx, vmainsGxGy );
373
  vtmpData              = _mm_slli_epi32 ( vtmpData, 12 );
374
  vtmpData              = _mm_add_epi32  ( vtmpData, _mm_mullo_epi32( vtmpx, vsecsGxGy ) );
375
  vtmpData              = _mm_srai_epi32 ( vtmpData, 1 );
376
  __m128i vtmpyIn       = _mm_slli_epi32 ( _mm_setr_epi32( sumDIY0, sumDIY1, 0, 0 ), 2 );
377
  vtmpyIn               = _mm_sub_epi32  ( vtmpyIn, vtmpData );
378
379
  tmpy0 = sumAbsGY0 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( _mm_extract_epi32( vtmpyIn, 0 ), sumAbsGY0 ) );
380
  tmpy1 = sumAbsGY1 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( _mm_extract_epi32( vtmpyIn, 1 ), sumAbsGY1 ) );
381
#else
382
0
  tmpx0 = sumAbsGX0 == 0 ? 0 : rightShiftMSB( sumDIX0 *(1<< 2), sumAbsGX0 );
383
0
  tmpx0 = Clip3( -limit, limit, tmpx0 );
384
385
0
  int mainsGxGy0 = sumSignGY_GX0 >> 12;
386
0
  int secsGxGy0  = sumSignGY_GX0 & ( ( 1 << 12 ) - 1 );
387
0
  int tmpData0   = tmpx0 * mainsGxGy0;
388
0
  tmpData0       = ( ( tmpData0 *(1<< 12 )) + tmpx0 * secsGxGy0 ) >> 1;
389
0
  tmpy0 = sumAbsGY0 == 0 ? 0 : rightShiftMSB( ( ( sumDIY0 *(1<< 2 )) - tmpData0 ), sumAbsGY0 );
390
0
  tmpy0 = Clip3( -limit, limit, tmpy0 );
391
392
393
0
  tmpx1 = sumAbsGX1 == 0 ? 0 : rightShiftMSB( sumDIX1 *(1<< 2), sumAbsGX1 );
394
0
  tmpx1 = Clip3( -limit, limit, tmpx1 );
395
396
0
  int mainsGxGy1 = sumSignGY_GX1 >> 12;
397
0
  int secsGxGy1  = sumSignGY_GX1 & ( ( 1 << 12 ) - 1 );
398
0
  int tmpData1   = tmpx1 * mainsGxGy1;
399
0
  tmpData1 = ( ( tmpData1 *(1<< 12 )) + tmpx1 * secsGxGy1 ) >> 1;
400
0
  tmpy1 = sumAbsGY1 == 0 ? 0 : rightShiftMSB( ( ( sumDIY1 *(1<< 2) ) - tmpData1 ), sumAbsGY1 );
401
0
  tmpy1 = Clip3( -limit, limit, tmpy1 );
402
0
#endif
403
404
0
#undef sumAbsGX0
405
0
#undef sumAbsGX1
406
0
#undef sumAbsGY0
407
0
#undef sumAbsGY1
408
0
#undef sumDIX0  
409
0
#undef sumDIX1  
410
0
#undef sumDIY0  
411
0
#undef sumDIY1  
412
0
}
413
#endif
414
415
template< X86_VEXT vext>
416
void BiOptFlowCoreSIMD( const Pel* srcY0,
417
                        const Pel* srcY1,
418
                        const Pel* gradX0,
419
                        const Pel* gradX1,
420
                        const Pel* gradY0,
421
                        const Pel* gradY1,
422
                        const int  width,
423
                        const int  height,
424
                              Pel* dstY,
425
                        const ptrdiff_t dstStride,
426
                        const int  shiftNum,
427
                        const int  offset,
428
                        const int  limit,
429
                        const ClpRng& clpRng,
430
                        const int bitDepth )
431
0
{
432
0
  const int widthG        = width  + BIO_ALIGN_SIZE;
433
0
  const int stridePredMC  = widthG;
434
0
        int offsetPos     = widthG * BIO_EXTEND_SIZE + BIO_EXTEND_SIZE;
435
0
  const int xUnit         = ( width  >> 2 );
436
0
  const int yUnit         = ( height >> 2 );
437
438
0
  const Pel* srcY0Temp;
439
0
  const Pel* srcY1Temp;
440
0
        Pel *dstY0;
441
  
442
0
  int OffPos;
443
0
  int OffPad = 0;
444
445
0
  int tmpx0, tmpy0, tmpx1, tmpy1;
446
447
0
  for( int yu = 0; yu < yUnit; yu++, srcY0 += ( stridePredMC << 2 ), srcY1 += ( stridePredMC << 2 ), dstY += ( dstStride << 2 ), offsetPos += ( widthG << 2 ) )
448
0
  {
449
0
    srcY0Temp = srcY0;
450
0
    srcY1Temp = srcY1;
451
0
    dstY0     = dstY;
452
    
453
0
    OffPos = offsetPos;
454
0
    OffPad = ( ( yu * widthG ) << 2 );
455
456
0
    for( int xu = 0; xu < xUnit; xu += 2, srcY0Temp += 8, srcY1Temp += 8, dstY0 += 8, OffPos += 8, OffPad += 8 )
457
0
    {
458
#if USE_AVX2
459
      calcBIOSums2x_AVX2( srcY0Temp, srcY1Temp,
460
                          gradX0 + OffPad, gradX1 + OffPad, gradY0 + OffPad, gradY1 + OffPad,
461
                          stridePredMC,
462
                          bitDepth, limit,
463
                          tmpx0, tmpx1, tmpy0, tmpy1 );
464
465
      addBIOAvg4_2x_SSE<vext>( srcY0Temp + stridePredMC + 1,
466
                               srcY1Temp + stridePredMC + 1,
467
                               dstY0, dstStride,
468
                               gradX0 + OffPos, gradX1 + OffPos, gradY0 + OffPos, gradY1 + OffPos, widthG,
469
                               tmpx0, tmpx1, tmpy0, tmpy1,
470
                               shiftNum, offset, clpRng );
471
#else
472
      calcBIOSums_SSE<vext>( srcY0Temp + 0, srcY1Temp + 0,
473
                             gradX0 + OffPad + 0, gradX1 + OffPad + 0, gradY0 + OffPad + 0, gradY1 + OffPad + 0,
474
                             stridePredMC,
475
                             bitDepth, limit,
476
                             tmpx0, tmpy0 );
477
      calcBIOSums_SSE<vext>( srcY0Temp + 4, srcY1Temp + 4,
478
                             gradX0 + OffPad + 4, gradX1 + OffPad + 4, gradY0 + OffPad + 4, gradY1 + OffPad + 4,
479
                             stridePredMC,
480
                             bitDepth, limit,
481
                             tmpx1, tmpy1 );
482
483
      addBIOAvg4_2x_SSE<vext>( srcY0Temp + stridePredMC + 1,
484
                               srcY1Temp + stridePredMC + 1,
485
                               dstY0, dstStride,
486
                               gradX0 + OffPos, gradX1 + OffPos, gradY0 + OffPos, gradY1 + OffPos, widthG,
487
                               tmpx0, tmpy0, tmpx1, tmpy1,
488
                               shiftNum, offset, clpRng );
489
#endif
490
0
    }  // xu
491
0
  }  // yu
492
#if USE_AVX2
493
494
  _mm256_zeroupper();
495
#endif
496
0
}
Unexecuted instantiation: void vvdec::BiOptFlowCoreSIMD<(vvdec::x86_simd::X86_VEXT)1>(short const*, short const*, short const*, short const*, short const*, short const*, int, int, short*, long, int, int, int, vvdec::ClpRngTemplate<short> const&, int)
Unexecuted instantiation: void vvdec::BiOptFlowCoreSIMD<(vvdec::x86_simd::X86_VEXT)4>(short const*, short const*, short const*, short const*, short const*, short const*, int, int, short*, long, int, int, int, vvdec::ClpRngTemplate<short> const&, int)
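Each inner iteration of BiOptFlowCoreSIMD covers two horizontally adjacent 4x4 subblocks: the per-subblock ( tmpx, tmpy ) refinements come from the 6x6 gradient windows at OffPad (two calcBIOSums_SSE calls, or one fused calcBIOSums2x_AVX2 call), and the srcY0Temp + stridePredMC + 1 offset steps past the one-sample BIO extension border before the 8-wide averaging at OffPos writes the final 8x4 strip.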
497
498
template< X86_VEXT vext, bool bi >
499
void applyPROF_SSE(Pel* dstPel, ptrdiff_t dstStride, const Pel* srcPel, const Pel* gradX, const Pel* gradY, const int* dMvX, const int* dMvY, int shiftNum, Pel offset, const ClpRng& clpRng)
500
0
{
501
0
  static constexpr ptrdiff_t srcStride  = 6;
502
0
  static constexpr ptrdiff_t gradStride = 4;
503
0
  static constexpr ptrdiff_t dMvStride  = 4;
504
505
0
  const int dILimit = 1 << std::max<int>(clpRng.bd + 1, 13);
506
507
#if USE_AVX2
508
  __m256i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0, mm_src;
509
  __m256i mm_offset = _mm256_set1_epi16( offset );
510
  __m256i vibdimin  = _mm256_set1_epi16( clpRng.min() );
511
  __m256i vibdimax  = _mm256_set1_epi16( clpRng.max() );
512
  __m256i mm_dimin  = _mm256_set1_epi32( -dILimit );
513
  __m256i mm_dimax  = _mm256_set1_epi32( dILimit - 1 );
514
515
  const int *vX0 = dMvX, *vY0 = dMvY;
516
  const Pel *gX0 = gradX, *gY0 = gradY;
517
518
  // first two rows
519
  mm_dmvx = _mm256_loadu_si256( ( const __m256i * ) vX0 );
520
  mm_dmvy = _mm256_loadu_si256( ( const __m256i * ) vY0 );
521
522
  mm_dmvx = _mm256_packs_epi32( mm_dmvx, _mm256_setzero_si256() );
523
  mm_dmvy = _mm256_packs_epi32( mm_dmvy, _mm256_setzero_si256() );
524
525
  mm_gradx = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadu_si64( ( __m128i* )gX0 ) ), _mm_loadu_si64( ( __m128i* )( gX0 + gradStride ) ), 1 );
526
  mm_grady = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadu_si64( ( __m128i* )gY0 ) ), _mm_loadu_si64( ( __m128i* )( gY0 + gradStride ) ), 1 );
527
  
528
  mm_dI0   = _mm256_madd_epi16( _mm256_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm256_unpacklo_epi16( mm_gradx, mm_grady ) );
529
  mm_dI0   = _mm256_min_epi32( mm_dimax, _mm256_max_epi32( mm_dimin, mm_dI0 ) );
530
531
  // next two rows
532
  vX0 += ( dMvStride << 1 ); vY0 += ( dMvStride << 1 ); gX0 += ( gradStride << 1 ); gY0 += ( gradStride << 1 );
533
  
534
  mm_dmvx = _mm256_loadu_si256( ( const __m256i * ) vX0 );
535
  mm_dmvy = _mm256_loadu_si256( ( const __m256i * ) vY0 );
536
537
  mm_dmvx = _mm256_packs_epi32( mm_dmvx, _mm256_setzero_si256() );
538
  mm_dmvy = _mm256_packs_epi32( mm_dmvy, _mm256_setzero_si256() );
539
540
  mm_gradx = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadu_si64( ( __m128i* )gX0 ) ), _mm_loadu_si64( ( __m128i* )( gX0 + gradStride ) ), 1 );
541
  mm_grady = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadu_si64( ( __m128i* )gY0 ) ), _mm_loadu_si64( ( __m128i* )( gY0 + gradStride ) ), 1 );
542
  
543
  mm_dI    = _mm256_madd_epi16( _mm256_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm256_unpacklo_epi16( mm_gradx, mm_grady ) );
544
  mm_dI    = _mm256_min_epi32( mm_dimax, _mm256_max_epi32( mm_dimin, mm_dI ) );
545
546
  // combine four rows
547
  mm_dI = _mm256_packs_epi32( mm_dI0, mm_dI );
548
  const Pel* src0 = srcPel + srcStride;
549
  mm_src = _mm256_inserti128_si256(
550
    _mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadu_si64((const __m128i *)srcPel), _mm_loadu_si64((const __m128i *)(srcPel + (srcStride << 1))))),
551
    _mm_unpacklo_epi64(_mm_loadu_si64((const __m128i *)src0), _mm_loadu_si64((const __m128i *)(src0 + (srcStride << 1)))),
552
    1
553
  );
554
  mm_dI = _mm256_add_epi16(mm_dI, mm_src);
555
0
  if (!bi)
556
0
  {
557
0
    mm_dI = _mm256_srai_epi16(_mm256_adds_epi16(mm_dI, mm_offset), shiftNum);
558
0
    mm_dI = _mm256_min_epi16(vibdimax, _mm256_max_epi16(vibdimin, mm_dI));
559
0
  }
560
561
  // store final results
562
  __m128i dITmp = _mm256_extracti128_si256(mm_dI, 1);
563
  Pel* dst0 = dstPel;
564
  _mm_storeu_si64((__m128i *)dst0, _mm256_castsi256_si128(mm_dI));
565
  dst0 += dstStride; _mm_storeu_si64((__m128i *)dst0, dITmp);
566
  dst0 += dstStride; _mm_storeu_si64((__m128i *)dst0, _mm_unpackhi_epi64(_mm256_castsi256_si128(mm_dI), _mm256_castsi256_si128(mm_dI)));
567
  dst0 += dstStride; _mm_storeu_si64((__m128i *)dst0, _mm_unpackhi_epi64(dITmp, dITmp));
568
#else
569
  __m128i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0;
570
  __m128i mm_offset = _mm_set1_epi16( offset );
571
  __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
572
  __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
573
  __m128i mm_dimin  = _mm_set1_epi32( -dILimit );
574
  __m128i mm_dimax  = _mm_set1_epi32( dILimit - 1 );
575
  
576
  static constexpr int height = 4;
577
578
0
  for( int h = 0; h < height; h += 2 )
579
0
  {
580
0
    const int* vX = dMvX;
581
0
    const int* vY = dMvY;
582
0
    const Pel* gX = gradX;
583
0
    const Pel* gY = gradY;
584
0
    const Pel* src = srcPel;
585
0
    Pel*       dst = dstPel;
586
587
    // first row
588
0
    mm_dmvx  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) vX ), _mm_setzero_si128() );
589
0
    mm_dmvy  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) vY ), _mm_setzero_si128() );
590
0
    mm_gradx = _mm_loadu_si64 ( ( __m128i* ) gX );
591
0
    mm_grady = _mm_loadu_si64 ( ( __m128i* ) gY );
592
0
    mm_dI0   = _mm_madd_epi16 ( _mm_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm_unpacklo_epi16( mm_gradx, mm_grady ) );
593
0
    mm_dI0   = _mm_min_epi32  ( mm_dimax, _mm_max_epi32( mm_dimin, mm_dI0 ) );
594
595
    // second row
596
0
    mm_dmvx  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) ( vX + dMvStride ) ), _mm_setzero_si128() );
597
0
    mm_dmvy  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) ( vY + dMvStride ) ), _mm_setzero_si128() );
598
0
    mm_gradx = _mm_loadu_si64 ( ( __m128i* ) ( gX + gradStride ) );
599
0
    mm_grady = _mm_loadu_si64 ( ( __m128i* ) ( gY + gradStride ) );
600
0
    mm_dI    = _mm_madd_epi16 ( _mm_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm_unpacklo_epi16( mm_gradx, mm_grady ) );
601
0
    mm_dI    = _mm_min_epi32  ( mm_dimax, _mm_max_epi32( mm_dimin, mm_dI ) );
602
603
    // combine both rows
604
0
    mm_dI = _mm_packs_epi32( mm_dI0, mm_dI );
605
0
    mm_dI = _mm_add_epi16  ( _mm_unpacklo_epi64( _mm_loadu_si64( ( const __m128i * )src ), _mm_loadu_si64( ( const __m128i * )( src + srcStride ) ) ), mm_dI );
606
0
    if (!bi)
607
0
    {
608
0
      mm_dI = _mm_srai_epi16(_mm_adds_epi16(mm_dI, mm_offset), shiftNum);
609
0
      mm_dI = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, mm_dI));
610
0
    }
611
612
0
    _mm_storeu_si64( ( __m128i * )  dst,                                   mm_dI );
613
0
    _mm_storeu_si64( ( __m128i * )( dst + dstStride ), _mm_unpackhi_epi64( mm_dI, mm_dI ) );
614
615
0
    dMvX   += (dMvStride  << 1);
616
0
    dMvY   += (dMvStride  << 1);
617
0
    gradX  += (gradStride << 1);
618
0
    gradY  += (gradStride << 1);
619
0
    srcPel += (srcStride  << 1);
620
0
    dstPel += (dstStride  << 1);
621
0
  }
622
#endif
623
0
}
Unexecuted instantiation: void vvdec::applyPROF_SSE<(vvdec::x86_simd::X86_VEXT)1, false>(short*, long, short const*, short const*, short const*, int const*, int const*, int, short, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::applyPROF_SSE<(vvdec::x86_simd::X86_VEXT)1, true>(short*, long, short const*, short const*, short const*, int const*, int const*, int, short, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::applyPROF_SSE<(vvdec::x86_simd::X86_VEXT)4, false>(short*, long, short const*, short const*, short const*, int const*, int const*, int, short, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::applyPROF_SSE<(vvdec::x86_simd::X86_VEXT)4, true>(short*, long, short const*, short const*, short const*, int const*, int const*, int, short, vvdec::ClpRngTemplate<short> const&)
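Per 4x4 block, PROF adds a clipped gradient-times-motion-difference term to every prediction sample. A scalar sketch of the arithmetic (ignoring the 16-bit wrapping/saturation details of the vector paths; strides fixed as in the template: src 6, grad and dMv 4):

for( int y = 0; y < 4; y++ )
{
  for( int x = 0; x < 4; x++ )
  {
    // refinement term, clipped to the intermediate dynamic range
    int dI = dMvX[y * 4 + x] * gradX[y * 4 + x] + dMvY[y * 4 + x] * gradY[y * 4 + x];
    dI     = Clip3( -dILimit, dILimit - 1, dI );

    const int s = srcPel[y * 6 + x] + dI;
    // bi-prediction keeps intermediate precision, uni-prediction rounds and clips
    dstPel[y * dstStride + x] = bi ? ( Pel ) s : ( Pel ) Clip3<int>( clpRng.min(), clpRng.max(), ( s + offset ) >> shiftNum );
  }
}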
624
625
626
template< X86_VEXT vext >
627
void roundIntVector_SIMD(int* v, int size, unsigned int nShift, const int dmvLimit)
628
0
{
629
0
  CHECKD(size % 16 != 0, "Size must be multiple of 16!");
630
#ifdef USE_AVX512
631
  if (vext >= AVX512 && size >= 16)
632
  {
633
    __m512i dMvMin = _mm512_set1_epi32(-dmvLimit);
634
    __m512i dMvMax = _mm512_set1_epi32(dmvLimit);
635
    __m512i nOffset = _mm512_set1_epi32((1 << (nShift - 1)));
636
    __m512i vones = _mm512_set1_epi32(1);
637
    __m512i vzero = _mm512_setzero_si512();
638
    for (int i = 0; i < size; i += 16, v += 16)
639
    {
640
      __m512i src = _mm512_loadu_si512(v);
641
      __mmask16 mask = _mm512_cmpge_epi32_mask(src, vzero);
642
      src = _mm512_add_epi32(src, nOffset);
643
      __m512i dst = _mm512_srai_epi32(_mm512_mask_sub_epi32(src, mask, src, vones), nShift);
644
      dst = _mm512_min_epi32(dMvMax, _mm512_max_epi32(dMvMin, dst));
645
      _mm512_storeu_si512(v, dst);
646
    }
647
  }
648
  else
649
#endif
650
#ifdef USE_AVX2
651
0
  if (vext >= AVX2 && size >= 8)
652
0
  {
653
0
    __m256i dMvMin = _mm256_set1_epi32(-dmvLimit);
654
0
    __m256i dMvMax = _mm256_set1_epi32(dmvLimit);
655
0
    __m256i nOffset = _mm256_set1_epi32(1 << (nShift - 1));
656
0
    __m256i vzero = _mm256_setzero_si256();
657
0
    for (int i = 0; i < size; i += 8, v += 8)
658
0
    {
659
0
      __m256i src = _mm256_lddqu_si256((__m256i*)v);
660
0
      __m256i of  = _mm256_cmpgt_epi32(src, vzero);
661
0
      __m256i dst = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(src, nOffset), of), nShift);
662
0
      dst = _mm256_min_epi32(dMvMax, _mm256_max_epi32(dMvMin, dst));
663
0
      _mm256_storeu_si256((__m256i*)v, dst);
664
0
    }
665
0
  }
666
0
  else
667
0
#endif
668
0
  {
669
0
    __m128i dMvMin = _mm_set1_epi32(-dmvLimit);
670
0
    __m128i dMvMax = _mm_set1_epi32(dmvLimit);
671
0
    __m128i nOffset = _mm_set1_epi32((1 << (nShift - 1)));
672
0
    __m128i vzero = _mm_setzero_si128();
673
0
    for (int i = 0; i < size; i += 4, v += 4)
674
0
    {
675
0
      __m128i src = _mm_loadu_si128((__m128i*)v);
676
0
      __m128i of  = _mm_cmpgt_epi32(src, vzero);
677
0
      __m128i dst = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(src, nOffset), of), nShift);
678
0
      dst = _mm_min_epi32(dMvMax, _mm_max_epi32(dMvMin, dst));
679
0
      _mm_storeu_si128((__m128i*)v, dst);
680
0
    }
681
0
  }
682
0
}
Unexecuted instantiation: void vvdec::roundIntVector_SIMD<(vvdec::x86_simd::X86_VEXT)1>(int*, int, unsigned int, int)
Unexecuted instantiation: void vvdec::roundIntVector_SIMD<(vvdec::x86_simd::X86_VEXT)4>(int*, int, unsigned int, int)
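All three code paths implement round-to-nearest with ties toward zero: the compare against zero yields an all-ones (i.e. -1) mask for positive inputs, so adding it folds the `- ( v > 0 )` term into the rounding offset before the arithmetic shift. Scalar sketch:

for( int i = 0; i < size; i++ )
{
  const int of = v[i] > 0 ? -1 : 0;   // _mm_cmpgt_epi32 mask reinterpreted as an integer
  v[i] = Clip3( -dmvLimit, dmvLimit, ( v[i] + ( 1 << ( nShift - 1 ) ) + of ) >> nShift );
}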
683
684
template< X86_VEXT vext, bool PAD = true>
685
void gradFilter_SSE( int16_t* src, ptrdiff_t _srcStride, int width, int height, ptrdiff_t _gradStride, int16_t* gradX, int16_t* gradY, const int bitDepth)
686
0
{
687
0
  const int widthInside      = PAD ? width  - 2 * BIO_EXTEND_SIZE : 4;
688
0
  const int heightInside     = PAD ? height - 2 * BIO_EXTEND_SIZE : 4;
689
0
  const ptrdiff_t gradStride = PAD ? _gradStride                  : 4;
690
0
  const ptrdiff_t srcStride  = PAD ? _srcStride                   : 6;
691
692
0
  int16_t* srcTmp   = PAD ? src   + srcStride  + 1 : src;
693
0
  int16_t* gradXTmp = PAD ? gradX + gradStride + 1 : gradX;
694
0
  int16_t* gradYTmp = PAD ? gradY + gradStride + 1 : gradY;
695
696
0
  const int shift1 = std::max<int>( 6, bitDepth - 6 );
697
698
0
  CHECKD( gradStride != _gradStride, "Wrong PROF stride!" );
699
0
  CHECKD( srcStride  != _srcStride,  "Wrong PROF stride!" );
700
701
#if USE_AVX2
702
0
  if( PAD && ( widthInside & 15 ) == 0 && vext >= AVX2 )
703
0
  {
704
0
    for( int y = 0; y < heightInside; y++ )
705
0
    {
706
0
      for( int x = 0; x < widthInside; x += 16 )
707
0
      {
708
0
        __m256i mmPixTop    = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * ) ( srcTmp + x - srcStride ) ), shift1 );
709
0
        __m256i mmPixBottom = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * ) ( srcTmp + x + srcStride ) ), shift1 );
710
0
        __m256i mmPixLeft   = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * ) ( srcTmp + x - 1 ) ),         shift1 );
711
0
        __m256i mmPixRight  = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * ) ( srcTmp + x + 1 ) ),         shift1 );
712
713
        __m256i mmGradVer = _mm256_sub_epi16( mmPixBottom, mmPixTop );
714
        __m256i mmGradHor = _mm256_sub_epi16( mmPixRight, mmPixLeft );
715
716
        _mm256_storeu_si256( ( __m256i * )( gradYTmp + x ), mmGradVer );
717
        _mm256_storeu_si256( ( __m256i * )( gradXTmp + x ), mmGradHor );
718
      }
719
720
      gradXTmp[widthInside] = gradXTmp[widthInside - 1];
721
      gradYTmp[widthInside] = gradYTmp[widthInside - 1];
722
      srcTmp  [widthInside] = srcTmp  [widthInside - 1];
723
      gradXTmp[-1]          = gradXTmp[0];
724
      gradYTmp[-1]          = gradYTmp[0];
725
      srcTmp  [-1]          = srcTmp  [0];
726
727
      gradXTmp += gradStride;
728
      gradYTmp += gradStride;
729
      srcTmp   += srcStride;
730
    }
731
    
732
    {
733
      gradXTmp = gradX + gradStride;
734
      gradYTmp = gradY + gradStride;
735
      srcTmp   = src   + srcStride;
736
      
737
      Pel * src0 = gradXTmp;
738
      Pel * src1 = gradXTmp + (height - 2 * BIO_EXTEND_SIZE - 1) * gradStride;
739
      Pel * src2 = gradYTmp;
740
      Pel * src3 = gradYTmp + (height - 2 * BIO_EXTEND_SIZE - 1) * gradStride;
741
      Pel * src4 = srcTmp;
742
      Pel * src5 = srcTmp + (height - 2 * BIO_EXTEND_SIZE - 1) * srcStride;
743
      
744
      Pel * dst0 = gradXTmp - gradStride;
745
      Pel * dst1 = gradXTmp + (height - 2 * BIO_EXTEND_SIZE) * gradStride;
746
      Pel * dst2 = gradYTmp - gradStride;
747
      Pel * dst3 = gradYTmp + (height - 2 * BIO_EXTEND_SIZE) * gradStride;
748
      Pel * dst4 = srcTmp - srcStride;
749
      Pel * dst5 = srcTmp + (height - 2 * BIO_EXTEND_SIZE) * srcStride;
750
      
751
      int x;
752
      
753
0
      for (x = 0; x < widthInside; x+=16)
754
0
      {
755
0
        __m256i s0 = _mm256_loadu_si256(( __m256i* )(src0 + x));
756
0
        __m256i s1 = _mm256_loadu_si256(( __m256i* )(src1 + x));
757
0
        __m256i s2 = _mm256_loadu_si256(( __m256i* )(src2 + x));
758
0
        __m256i s3 = _mm256_loadu_si256(( __m256i* )(src3 + x));
759
0
        __m256i s4 = _mm256_loadu_si256(( __m256i* )(src4 + x));
760
0
        __m256i s5 = _mm256_loadu_si256(( __m256i* )(src5 + x));
761
0
        _mm256_storeu_si256(( __m256i* )(dst0 + x), s0);
762
0
        _mm256_storeu_si256(( __m256i* )(dst1 + x), s1);
763
0
        _mm256_storeu_si256(( __m256i* )(dst2 + x), s2);
764
0
        _mm256_storeu_si256(( __m256i* )(dst3 + x), s3);
765
0
        _mm256_storeu_si256(( __m256i* )(dst4 + x), s4);
766
0
        _mm256_storeu_si256(( __m256i* )(dst5 + x), s5);
767
0
      }
768
      
769
0
      ((int32_t * )(dst0 + x))[0] = ((int32_t * )(src0 + x))[0];
770
0
      ((int32_t * )(dst1 + x))[0] = ((int32_t * )(src1 + x))[0];
771
0
      ((int32_t * )(dst2 + x))[0] = ((int32_t * )(src2 + x))[0];
772
0
      ((int32_t * )(dst3 + x))[0] = ((int32_t * )(src3 + x))[0];
773
0
      ((int32_t * )(dst4 + x))[0] = ((int32_t * )(src4 + x))[0];
774
0
      ((int32_t * )(dst5 + x))[0] = ((int32_t * )(src5 + x))[0];
775
0
    }
776
0
  }
777
0
  else
778
0
#endif
779
0
  if( PAD && ( widthInside & 7 ) == 0 )
780
0
  {
781
0
    for( int y = 0; y < heightInside; y++ )
782
0
    {
783
0
      for( int x = 0; x < widthInside; x += 8 )
784
0
      {
785
0
        __m128i mmPixTop    = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x - srcStride ) ), shift1 );
786
0
        __m128i mmPixBottom = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x + srcStride ) ), shift1 );
787
0
        __m128i mmPixLeft   = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x - 1 ) ),         shift1 );
788
0
        __m128i mmPixRight  = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x + 1 ) ),         shift1 );
789
790
0
        __m128i mmGradVer = _mm_sub_epi16( mmPixBottom, mmPixTop );
791
0
        __m128i mmGradHor = _mm_sub_epi16( mmPixRight, mmPixLeft );
792
793
0
        _mm_storeu_si128((__m128i *) (gradYTmp + x), mmGradVer);
794
0
        _mm_storeu_si128((__m128i *) (gradXTmp + x), mmGradHor);
795
0
      }
796
797
0
      gradXTmp[widthInside] = gradXTmp[widthInside - 1];
798
0
      gradYTmp[widthInside] = gradYTmp[widthInside - 1];
799
0
      srcTmp  [widthInside] = srcTmp  [widthInside - 1];
800
0
      gradXTmp[-1]          = gradXTmp[0];
801
0
      gradYTmp[-1]          = gradYTmp[0];
802
0
      srcTmp  [-1]          = srcTmp  [0];
803
804
0
      gradXTmp += gradStride;
805
0
      gradYTmp += gradStride;
806
0
      srcTmp += srcStride;
807
0
    }
808
    
809
0
    {
810
0
      gradXTmp = gradX + gradStride;
811
0
      gradYTmp = gradY + gradStride;
812
0
      srcTmp   = src   + srcStride;
813
      
814
0
      Pel * src0 = gradXTmp;
815
0
      Pel * src1 = gradXTmp + (height - 2 * BIO_EXTEND_SIZE - 1) * gradStride;
816
0
      Pel * src2 = gradYTmp;
817
0
      Pel * src3 = gradYTmp + (height - 2 * BIO_EXTEND_SIZE - 1) * gradStride;
818
0
      Pel * src4 = srcTmp;
819
0
      Pel * src5 = srcTmp + (height - 2 * BIO_EXTEND_SIZE - 1) * srcStride;
820
      
821
0
      Pel * dst0 = gradXTmp - gradStride;
822
0
      Pel * dst1 = gradXTmp + (height - 2 * BIO_EXTEND_SIZE) * gradStride;
823
0
      Pel * dst2 = gradYTmp - gradStride;
824
0
      Pel * dst3 = gradYTmp + (height - 2 * BIO_EXTEND_SIZE) * gradStride;
825
0
      Pel * dst4 = srcTmp - srcStride;
826
0
      Pel * dst5 = srcTmp + (height - 2 * BIO_EXTEND_SIZE) * srcStride;
827
      
828
0
      int x;
829
      
830
0
      for (x = 0; x < widthInside; x+=8)
831
0
      {
832
0
        __m128i s0 = _mm_loadu_si128(( __m128i* )(src0 + x));
833
0
        __m128i s1 = _mm_loadu_si128(( __m128i* )(src1 + x));
834
0
        __m128i s2 = _mm_loadu_si128(( __m128i* )(src2 + x));
835
0
        __m128i s3 = _mm_loadu_si128(( __m128i* )(src3 + x));
836
0
        __m128i s4 = _mm_loadu_si128(( __m128i* )(src4 + x));
837
0
        __m128i s5 = _mm_loadu_si128(( __m128i* )(src5 + x));
838
0
        _mm_storeu_si128(( __m128i* )(dst0 + x), s0);
839
0
        _mm_storeu_si128(( __m128i* )(dst1 + x), s1);
840
0
        _mm_storeu_si128(( __m128i* )(dst2 + x), s2);
841
0
        _mm_storeu_si128(( __m128i* )(dst3 + x), s3);
842
0
        _mm_storeu_si128(( __m128i* )(dst4 + x), s4);
843
0
        _mm_storeu_si128(( __m128i* )(dst5 + x), s5);
844
0
      }
845
      
846
0
      ((int32_t * )(dst0 + x))[0] = ((int32_t * )(src0 + x))[0];
847
0
      ((int32_t * )(dst1 + x))[0] = ((int32_t * )(src1 + x))[0];
848
0
      ((int32_t * )(dst2 + x))[0] = ((int32_t * )(src2 + x))[0];
849
0
      ((int32_t * )(dst3 + x))[0] = ((int32_t * )(src3 + x))[0];
850
0
      ((int32_t * )(dst4 + x))[0] = ((int32_t * )(src4 + x))[0];
851
0
      ((int32_t * )(dst5 + x))[0] = ((int32_t * )(src5 + x))[0];
852
0
    }
853
0
  }
854
0
  else
855
0
  {
856
0
    CHECK( widthInside != 4, "Width needs to be '4'!" );
857
858
0
    for( int y = 0; y < ( PAD ? heightInside : 4 ); y++ )
859
0
    {
860
0
      __m128i mmPixTop    = _mm_srai_epi16( _mm_loadu_si64( ( __m128i* )( srcTmp - srcStride ) ), shift1 );
861
0
      __m128i mmPixBottom = _mm_srai_epi16( _mm_loadu_si64( ( __m128i* )( srcTmp + srcStride ) ), shift1 );
862
0
      __m128i mmPixLeft   = _mm_srai_epi16( _mm_loadu_si64( ( __m128i* )( srcTmp - 1 ) ),         shift1 );
863
0
      __m128i mmPixRight  = _mm_srai_epi16( _mm_loadu_si64( ( __m128i* )( srcTmp + 1 ) ),         shift1 );
864
865
0
      __m128i mmGradVer = _mm_sub_epi16( mmPixBottom, mmPixTop );
866
0
      __m128i mmGradHor = _mm_sub_epi16( mmPixRight, mmPixLeft );
867
868
0
      _mm_storeu_si64( ( __m128i * )( gradYTmp ), mmGradVer );
869
0
      _mm_storeu_si64( ( __m128i * )( gradXTmp ), mmGradHor );
870
871
0
      if( PAD )
872
0
      {
873
0
        gradXTmp[widthInside] = gradXTmp[widthInside - 1];
874
0
        gradYTmp[widthInside] = gradYTmp[widthInside - 1];
875
0
        srcTmp  [widthInside] = srcTmp  [widthInside - 1];
876
0
        gradXTmp[-1]          = gradXTmp[0];
877
0
        gradYTmp[-1]          = gradYTmp[0];
878
0
        srcTmp  [-1]          = srcTmp  [0];
879
0
      }
880
881
0
      gradXTmp += gradStride;
882
0
      gradYTmp += gradStride;
883
0
      srcTmp   += srcStride;
884
0
    }
885

886
0
    if( PAD )
887
0
    {
888
0
      gradXTmp = gradX + gradStride;
889
0
      gradYTmp = gradY + gradStride;
890
0
      srcTmp   = src   + srcStride;
891

892
0
      ::memcpy( gradXTmp + heightInside * gradStride, gradXTmp + ( heightInside - 1 ) * gradStride, sizeof( int16_t ) * ( width ) );
893
0
      ::memcpy( gradYTmp + heightInside * gradStride, gradYTmp + ( heightInside - 1 ) * gradStride, sizeof( int16_t ) * ( width ) );
894
0
      ::memcpy( srcTmp   + heightInside * srcStride , srcTmp   + ( heightInside - 1 ) * srcStride , sizeof( int16_t ) * ( width ) );
895
0
      ::memcpy( gradXTmp - gradStride, gradXTmp, sizeof( int16_t ) * ( width ) );
896
0
      ::memcpy( gradYTmp - gradStride, gradYTmp, sizeof( int16_t ) * ( width ) );
897
0
      ::memcpy( srcTmp   - srcStride , srcTmp  , sizeof( int16_t ) * ( width ) );
898
0
    }
899
0
  }
900
#if USE_AVX2
901
902
0
  _mm256_zeroupper();
903
0
#endif
904
0
}
Unexecuted instantiation: void vvdec::gradFilter_SSE<(vvdec::x86_simd::X86_VEXT)1, true>(short*, long, int, int, long, short*, short*, int)
Unexecuted instantiation: void vvdec::gradFilter_SSE<(vvdec::x86_simd::X86_VEXT)1, false>(short*, long, int, int, long, short*, short*, int)
Unexecuted instantiation: void vvdec::gradFilter_SSE<(vvdec::x86_simd::X86_VEXT)4, true>(short*, long, int, int, long, short*, short*, int)
Unexecuted instantiation: void vvdec::gradFilter_SSE<(vvdec::x86_simd::X86_VEXT)4, false>(short*, long, int, int, long, short*, short*, int)
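All three branches compute the same central differences on samples pre-shifted by shift1; only the vector width and the border handling differ. Scalar sketch of the core (the one-sample border padding is handled separately, as in the PAD paths above):

for( int y = 0; y < heightInside; y++ )
{
  for( int x = 0; x < widthInside; x++ )
  {
    gradXTmp[x] = ( int16_t ) ( ( srcTmp[x + 1]         >> shift1 ) - ( srcTmp[x - 1]         >> shift1 ) );
    gradYTmp[x] = ( int16_t ) ( ( srcTmp[x + srcStride] >> shift1 ) - ( srcTmp[x - srcStride] >> shift1 ) );
  }
  gradXTmp += gradStride;  gradYTmp += gradStride;  srcTmp += srcStride;
}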
905
906
template<X86_VEXT vext>
907
void prefetchPadC_SSE( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height )
908
0
{
909
0
  _mm_prefetch( ( const char* )  src,            _MM_HINT_T0 );
910
0
  _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
911
912
0
  if( width == 7 )
913
0
  {
914
0
    const __m128i sl = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 12, 13 );
915
916
0
    __m128i l = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) src ), sl );
917
0
    Pel pel0 = *src;
918
919
0
    dst[-1-dstStride] = pel0;
920
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride ), l );
921
                                                         
922
0
    dst[-1] = pel0;                                      
923
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride ), l );
924
925
0
    for( height--, dst += dstStride, src += srcStride; height > 0; height--, src += srcStride, dst += dstStride )
926
0
    {
927
0
      _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
928
929
0
      l = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) src ), sl );
930
0
      pel0 = *src;
931
932
0
      dst[-1] = pel0;
933
0
      _mm_storeu_si128( ( __m128i* ) ( dst ), l );
934
0
    }
935
936
0
    dst[-1] = pel0;
937
0
    _mm_storeu_si128( ( __m128i* ) ( dst ), l );
938
0
  }
939
0
  else
940
0
  {
941
0
    const __m128i sl = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1 );
942
943
0
    __m128i l0 =                   _mm_loadu_si128( ( const __m128i* ) &src[0] );
944
0
    __m128i l1 = _mm_shuffle_epi8( _mm_loadu_si64 ( ( const __m128i* ) &src[8] ), sl );
945
0
    Pel pel0 = *src;
946
947
0
    dst[-1 - dstStride] = pel0;
948
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride - 0 ), l0 );
949
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst - 1 * dstStride + 8 ), l1 );
950
951
0
    dst[-1] = pel0;
952
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride - 0 ), l0 );
953
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst - 0 * dstStride + 8 ), l1 );
954
955
0
    for( height--, dst += dstStride, src += srcStride; height > 0; height--, src += srcStride, dst += dstStride )
956
0
    {
957
0
      _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
958
      
959
0
      l0 =                   _mm_loadu_si128( ( const __m128i* ) &src[0] );
960
0
      l1 = _mm_shuffle_epi8( _mm_loadu_si64 ( ( const __m128i* ) &src[8] ), sl );
961
0
      pel0 = *src;
962
963
0
      dst[-1] = pel0;
964
0
      _mm_storeu_si128( ( __m128i* ) ( dst - 0 ), l0 );
965
0
      _mm_storeu_si64 ( ( __m128i* ) ( dst + 8 ), l1 );
966
0
    }
967
968
0
    dst[-1] = pel0;
969
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 0 ), l0 );
970
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst + 8 ), l1 );
971
0
  }
972
0
}
Unexecuted instantiation: void vvdec::prefetchPadC_SSE<(vvdec::x86_simd::X86_VEXT)1>(short const*, long, short*, long, int, int)
Unexecuted instantiation: void vvdec::prefetchPadC_SSE<(vvdec::x86_simd::X86_VEXT)4>(short const*, long, short*, long, int, int)
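prefetchPadC_SSE copies a chroma reference block while replicating a one-sample border on every side: dst[-1] repeats the left column, the shuffle masks duplicate the last sample into the right column, and the doubled first and last row stores pad top and bottom. A scalar sketch of the equivalent result (prefetchPad_scalar is a hypothetical helper, not part of this file):

static void prefetchPad_scalar( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height )
{
  for( int y = -1; y <= height; y++ )
  {
    const Pel* s = src + Clip3( 0, height - 1, y ) * srcStride;
    Pel*       d = dst + y * dstStride;

    d[-1] = s[0];                           // left padding
    for( int x = 0; x <= width; x++ )
    {
      d[x] = s[std::min( x, width - 1 )];   // block samples plus right padding
    }
  }
}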
973
974
template<X86_VEXT vext>
975
void prefetchPadL_SSE( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height )
976
0
{
977
0
  _mm_prefetch( ( const char* )  src,            _MM_HINT_T0 );
978
0
  _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
979
980
0
  if( width == 15 )
981
0
  {
982
0
    const __m128i sb = _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 );
983
984
0
    __m128i l0 = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) &src[ 0] ), sb );
985
0
    __m128i l1 =                   _mm_loadu_si128( ( const __m128i* ) &src[ 6] );
986
0
    __m128i l3 =                   _mm_set1_epi16 (                     src[14] );
987
988
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 2 * dstStride -  2 ), l0 );
989
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 2 * dstStride +  6 ), l1 );
990
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst - 2 * dstStride + 14 ), l3 );
991
    
992
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride -  2 ), l0 );
993
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride +  6 ), l1 );
994
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst - 1 * dstStride + 14 ), l3 );
995
    
996
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride -  2 ), l0 );
997
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride +  6 ), l1 );
998
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst - 0 * dstStride + 14 ), l3 );
999
1000
0
    for( height--, dst += dstStride, src += srcStride; height > 0; height--, src += srcStride, dst += dstStride )
1001
0
    {
1002
0
      _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
1003
      
1004
0
      l0 = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) &src[ 0] ), sb );
1005
0
      l1 =                   _mm_loadu_si128( ( const __m128i* ) &src[ 6] );
1006
0
      l3 =                   _mm_set1_epi16 (                     src[14] );
1007
      
1008
0
      _mm_storeu_si128( ( __m128i* ) ( dst -  2 ), l0 );
1009
0
      _mm_storeu_si128( ( __m128i* ) ( dst +  6 ), l1 );
1010
0
      _mm_storeu_si64 ( ( __m128i* ) ( dst + 14 ), l3 );
1011
0
    }
1012
1013
0
    _mm_storeu_si128( ( __m128i* ) ( dst -  2 ), l0 );
1014
0
    _mm_storeu_si128( ( __m128i* ) ( dst +  6 ), l1 );
1015
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst + 14 ), l3 );
1016
1017
0
    dst += dstStride;
1018
    
1019
0
    _mm_storeu_si128( ( __m128i* ) ( dst -  2 ), l0 );
1020
0
    _mm_storeu_si128( ( __m128i* ) ( dst +  6 ), l1 );
1021
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst + 14 ), l3 );
1022
0
  }
1023
0
  else
1024
0
  {
1025
0
    const __m128i sb = _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 );
1026
1027
0
    __m128i l0 = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) &src[ 0] ), sb );
1028
0
    __m128i l1 =                   _mm_loadu_si128( ( const __m128i* ) &src[ 6] );
1029
0
    __m128i l2 =                   _mm_loadu_si128( ( const __m128i* ) &src[14] );
1030
0
    __m128i l3 =                   _mm_set1_epi16 (                     src[22] );
1031
1032
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 2 * dstStride -  2 ), l0 );
1033
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 2 * dstStride +  6 ), l1 );
1034
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 2 * dstStride + 14 ), l2 );
1035
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst - 2 * dstStride + 22 ), l3 );
1036
    
1037
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride -  2 ), l0 );
1038
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride +  6 ), l1 );
1039
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride + 14 ), l2 );
1040
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst - 1 * dstStride + 22 ), l3 );
1041
    
1042
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride -  2 ), l0 );
1043
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride +  6 ), l1 );
1044
0
    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride + 14 ), l2 );
1045
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst - 0 * dstStride + 22 ), l3 );
1046
1047
0
    for( height--, dst += dstStride, src += srcStride; height > 0; height--, src += srcStride, dst += dstStride )
1048
0
    {
1049
0
      _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
1050
      
1051
0
      l0 = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) &src[ 0] ), sb );
1052
0
      l1 =                   _mm_loadu_si128( ( const __m128i* ) &src[ 6] );
1053
0
      l2 =                   _mm_loadu_si128( ( const __m128i* ) &src[14] );
1054
0
      l3 =                   _mm_set1_epi16 (                     src[22] );
1055
      
1056
0
      _mm_storeu_si128( ( __m128i* ) ( dst -  2 ), l0 );
1057
0
      _mm_storeu_si128( ( __m128i* ) ( dst +  6 ), l1 );
1058
0
      _mm_storeu_si128( ( __m128i* ) ( dst + 14 ), l2 );
1059
0
      _mm_storeu_si64 ( ( __m128i* ) ( dst + 22 ), l3 );
1060
0
    }
1061
1062
0
    _mm_storeu_si128( ( __m128i* ) ( dst -  2 ), l0 );
1063
0
    _mm_storeu_si128( ( __m128i* ) ( dst +  6 ), l1 );
1064
0
    _mm_storeu_si128( ( __m128i* ) ( dst + 14 ), l2 );
1065
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst + 22 ), l3 );
1066
1067
0
    dst += dstStride;
1068
    
1069
0
    _mm_storeu_si128( ( __m128i* ) ( dst -  2 ), l0 );
1070
0
    _mm_storeu_si128( ( __m128i* ) ( dst +  6 ), l1 );
1071
0
    _mm_storeu_si128( ( __m128i* ) ( dst + 14 ), l2 );
1072
0
    _mm_storeu_si64 ( ( __m128i* ) ( dst + 22 ), l3 );
1073
0
  }
1074
0
}
1075
1076
#if USE_AVX2
1077
template<>
1078
void prefetchPadL_SSE<AVX2>( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height )
1079
0
{
1080
0
  _mm_prefetch( ( const char* ) src, _MM_HINT_T0 );
1081
0
  _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
1082
1083
0
  if( width == 15 )
1084
0
  {
1085
0
    const __m256i sl = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1086
0
                                         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 );
1087
1088
0
    __m128i l0 = _mm_set1_epi16( src[0] );
1089
0
    __m256i l1 = _mm256_shuffle_epi8( _mm256_loadu_si256( ( const __m256i* ) &src[1] ), sl );
1090
1091
0
    _mm_storeu_si64    ( ( __m128i* ) ( dst - 2 * dstStride -  2 ), l0 );
1092
0
    _mm256_storeu_si256( ( __m256i* ) ( dst - 2 * dstStride +  1 ), l1 );
1093
1094
0
    _mm_storeu_si64    ( ( __m128i* ) ( dst - 1 * dstStride -  2 ), l0 );
1095
0
    _mm256_storeu_si256( ( __m256i* ) ( dst - 1 * dstStride +  1 ), l1 );
1096
1097
0
    _mm_storeu_si64    ( ( __m128i* ) ( dst - 0 * dstStride -  2 ), l0 );
1098
0
    _mm256_storeu_si256( ( __m256i* ) ( dst - 0 * dstStride +  1 ), l1 );
1099
1100
0
    for( height--, dst += dstStride, src += srcStride; height > 0; height--, src += srcStride, dst += dstStride )
1101
0
    {
1102
0
      _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
1103
1104
0
      l0 = _mm_set1_epi16( src[0] );
1105
0
      l1 = _mm256_shuffle_epi8( _mm256_loadu_si256( ( const __m256i* ) & src[1] ), sl );
1106
1107
0
      _mm_storeu_si64    ( ( __m128i* ) ( dst - 2 ), l0 );
1108
0
      _mm256_storeu_si256( ( __m256i* ) ( dst + 1 ), l1 );
1109
0
    }
1110
    
1111
0
    _mm_storeu_si64    ( ( __m128i* ) ( dst - 2 ), l0 );
1112
0
    _mm256_storeu_si256( ( __m256i* ) ( dst + 1 ), l1 );
1113
1114
0
    dst += dstStride;
1115
    
1116
0
    _mm_storeu_si64    ( ( __m128i* ) ( dst - 2 ), l0 );
1117
0
    _mm256_storeu_si256( ( __m256i* ) ( dst + 1 ), l1 );
1118
0
  }
1119
0
  else
1120
0
  {
1121
0
    const __m128i sl = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 );
1122
1123
0
    __m128i l0 =                      _mm_set1_epi16(                         src[ 0] );
1124
0
    __m256i l1 =                      _mm256_loadu_si256( ( const __m256i* ) &src[ 1] );
1125
0
    __m128i l2 = _mm_shuffle_epi8   ( _mm_loadu_si128   ( ( const __m128i* ) &src[17] ), sl );
1126
1127
0
    _mm_storeu_si64    ( ( __m128i* ) ( dst - 2 * dstStride -  2 ), l0 );
1128
0
    _mm256_storeu_si256( ( __m256i* ) ( dst - 2 * dstStride +  1 ), l1 );
1129
0
    _mm_storeu_si128   ( ( __m128i* ) ( dst - 2 * dstStride + 17 ), l2 );
1130
1131
0
    _mm_storeu_si64    ( ( __m128i* ) ( dst - 1 * dstStride -  2 ), l0 );
1132
0
    _mm256_storeu_si256( ( __m256i* ) ( dst - 1 * dstStride +  1 ), l1 );
1133
0
    _mm_storeu_si128   ( ( __m128i* ) ( dst - 1 * dstStride + 17 ), l2 );
1134
1135
0
    _mm_storeu_si64    ( ( __m128i* ) ( dst - 0 * dstStride -  2 ), l0 );
1136
0
    _mm256_storeu_si256( ( __m256i* ) ( dst - 0 * dstStride +  1 ), l1 );
1137
0
    _mm_storeu_si128   ( ( __m128i* ) ( dst - 0 * dstStride + 17 ), l2 );
1138
1139
0
    for( height--, dst += dstStride, src += srcStride; height > 0; height--, src += srcStride, dst += dstStride )
1140
0
    {
1141
0
      _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
1142
      
1143
0
      l0 =                      _mm_set1_epi16(                         src[ 0] );
1144
0
      l1 =                      _mm256_loadu_si256( ( const __m256i* ) &src[ 1] );
1145
0
      l2 = _mm_shuffle_epi8   ( _mm_loadu_si128   ( ( const __m128i* ) &src[17] ), sl );
1146
1147
0
      _mm_storeu_si64    ( ( __m128i* ) ( dst -  2 ), l0 );
1148
0
      _mm256_storeu_si256( ( __m256i* ) ( dst +  1 ), l1 );
1149
0
      _mm_storeu_si128   ( ( __m128i* ) ( dst + 17 ), l2 );
1150
0
    }
1151
    
1152
0
    _mm_storeu_si64    ( ( __m128i* ) ( dst -  2 ), l0 );
1153
0
    _mm256_storeu_si256( ( __m256i* ) ( dst +  1 ), l1 );
1154
0
    _mm_storeu_si128   ( ( __m128i* ) ( dst + 17 ), l2 );
1155
1156
0
    dst += dstStride;
1157
    
1158
0
    _mm_storeu_si64    ( ( __m128i* ) ( dst -  2 ), l0 );
1159
0
    _mm256_storeu_si256( ( __m256i* ) ( dst +  1 ), l1 );
1160
0
    _mm_storeu_si128   ( ( __m128i* ) ( dst + 17 ), l2 );
1161
0
  }
1162
1163
0
  _mm256_zeroupper();
1164
0
}
1165
#endif
1166
1167
template<X86_VEXT vext>
1168
void InterPrediction::_initInterPredictionX86()
1169
0
{
1170
0
  BiOptFlow       = BiOptFlowCoreSIMD  <vext>;
1171
0
  PaddBIO         = PaddBIO_SIMD       <vext>;
1172
0
#if !defined( REAL_TARGET_WASM ) // profiling shows these functions are slower with WASM SIMD emulation than plain C++->WASM
1173
0
  BioGradFilter   = gradFilter_SSE     <vext, true>;
1174
0
  profGradFilter  = gradFilter_SSE     <vext, false>;
1175
0
#endif
1176
0
  applyPROF[0]    = applyPROF_SSE      <vext, false>;
1177
0
  applyPROF[1]    = applyPROF_SSE      <vext, true>;
1178
0
  roundIntVector  = roundIntVector_SIMD<vext>;
1179
0
  prefetchPad[0]  = prefetchPadL_SSE   <vext>;
1180
0
  prefetchPad[2]  = prefetchPadC_SSE   <vext>;
1181
0
}
Unexecuted instantiation: void vvdec::InterPrediction::_initInterPredictionX86<(vvdec::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvdec::InterPrediction::_initInterPredictionX86<(vvdec::x86_simd::X86_VEXT)4>()
1182
template void InterPrediction::_initInterPredictionX86<SIMDX86>();
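The two instantiation records above come from the per-extension translation units (InterPred_sse41.cpp and InterPred_avx2.cpp, as named earlier in this report), each of which compiles this header with SIMDX86 set to its own X86_VEXT value, so the single explicit instantiation registers one dispatch table per build flavor.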
1183
1184
#endif // TARGET_SIMD_X86
1185
#endif
1186
1187
}