Coverage Report

Created: 2026-04-01 07:49

/src/vvenc/source/Lib/CommonLib/x86/InterPredX86.h
 Line|  Count|Source
    1|       |/* -----------------------------------------------------------------------------
    2|       |The copyright in this software is being made available under the Clear BSD
    3|       |License, included below. No patent rights, trademark rights and/or
    4|       |other Intellectual Property Rights other than the copyrights concerning
    5|       |the Software are granted under this license.
    6|       |
    7|       |The Clear BSD License
    8|       |
    9|       |Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
   10|       |All rights reserved.
   11|       |
   12|       |Redistribution and use in source and binary forms, with or without modification,
   13|       |are permitted (subject to the limitations in the disclaimer below) provided that
   14|       |the following conditions are met:
   15|       |
   16|       |     * Redistributions of source code must retain the above copyright notice,
   17|       |     this list of conditions and the following disclaimer.
   18|       |
   19|       |     * Redistributions in binary form must reproduce the above copyright
   20|       |     notice, this list of conditions and the following disclaimer in the
   21|       |     documentation and/or other materials provided with the distribution.
   22|       |
   23|       |     * Neither the name of the copyright holder nor the names of its
   24|       |     contributors may be used to endorse or promote products derived from this
   25|       |     software without specific prior written permission.
   26|       |
   27|       |NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
   28|       |THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
   29|       |CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   30|       |LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   31|       |PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
   32|       |CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   33|       |EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   34|       |PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
   35|       |BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
   36|       |IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   37|       |ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   38|       |POSSIBILITY OF SUCH DAMAGE.
   39|       |
   40|       |
   41|       |------------------------------------------------------------------------------------------- */
   42|       |/** \file     InterPredX86.h
   43|       |    \brief    SIMD for InterPrediction
   44|       |*/
   45|       |
   46|       |//! \ingroup CommonLib
   47|       |//! \{
   48|       |
   49|       |
   50|       |//#include "CommonLib/CommonDef.h"
   51|       |#include "CommonDefX86.h"
   52|       |#include "Rom.h"
   53|       |#include "InterPrediction.h"
   54|       |
   55|       |#if defined(TARGET_SIMD_X86) && ENABLE_SIMD_OPT_BDOF
   56|       |
   57|       |//! \ingroup CommonLib
   58|       |//! \{
   59|       |
   60|       |namespace vvenc {
   61|       |
   62|       |
   63|       |
   64|       |static inline int rightShiftMSB(int numer, int denom)
   65|      0|{
   66|      0|  int shiftIdx = bit_scan_reverse(denom);
   67|      0|  return (numer >> shiftIdx);
   68|      0|}
Unexecuted instantiation: InterPred_sse41.cpp:vvenc::rightShiftMSB(int, int)
Unexecuted instantiation: InterPred_avx2.cpp:vvenc::rightShiftMSB(int, int)
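
Worth noting before the BDOF kernels below: rightShiftMSB replaces a division by a shift with the position of denom's most significant bit, i.e. it divides by the largest power of two not exceeding denom. A minimal scalar sketch, assuming bit_scan_reverse (defined elsewhere in vvenc) returns the index of the highest set bit; msbIndex here is a hypothetical stand-in:

#include <cassert>

static int msbIndex(unsigned v)          // hypothetical stand-in for bit_scan_reverse
{
  int idx = 0;
  while (v >>= 1) ++idx;                 // index of the highest set bit
  return idx;
}

static int rightShiftMSB_ref(int numer, int denom)
{
  assert(denom > 0);
  return numer >> msbIndex(static_cast<unsigned>(denom));
}

For example, rightShiftMSB_ref(100, 5) == 100 >> 2 == 25, whereas 100 / 5 == 20: BDOF tolerates this coarse quotient in exchange for avoiding a hardware divide.
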
   69|       |
   70|       |template<X86_VEXT vext>
   71|       |static inline void addBIOAvg4_SSE(const int16_t* src0, const int16_t* src1, int16_t* dst, ptrdiff_t dstStride, const int16_t* gradX0, const int16_t* gradX1, const int16_t* gradY0, const int16_t* gradY1, ptrdiff_t widthG, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng)
   72|      0|{
   73|      0|  const ptrdiff_t src0Stride = widthG + 2;
   74|      0|  const ptrdiff_t src1Stride = widthG + 2;
   75|      0|  const ptrdiff_t gradStride = widthG;
   76|       |
   77|      0|  __m128i mm_tmpx    = _mm_set1_epi32( ( tmpx & 0xffff ) | ( tmpy << 16 ) );
   78|      0|  __m128i mm_offset  = _mm_set1_epi32( offset );
   79|      0|  __m128i vibdimin   = _mm_set1_epi16( clpRng.min() );
   80|      0|  __m128i vibdimax   = _mm_set1_epi16( clpRng.max() );
   81|      0|  __m128i mm_a;
   82|      0|  __m128i mm_b;
   83|      0|  __m128i mm_sum;
   84|       |
   85|      0|  for( int y = 0; y < 4; y++, dst += dstStride, src0 += src0Stride, src1 += src1Stride, gradX0 += gradStride, gradX1 += gradStride, gradY0 += gradStride, gradY1 += gradStride )
   86|      0|  {
   87|      0|    mm_a   = _mm_unpacklo_epi16 ( _vv_loadl_epi64( (const __m128i *) gradX0 ), _vv_loadl_epi64( (const __m128i *) gradY0 ) );
   88|      0|    mm_b   = _mm_unpacklo_epi16 ( _vv_loadl_epi64( (const __m128i *) gradX1 ), _vv_loadl_epi64( (const __m128i *) gradY1 ) );
   89|      0|    mm_a   = _mm_sub_epi16      ( mm_a, mm_b );
   90|      0|    mm_sum = _mm_madd_epi16     ( mm_a, mm_tmpx );
   91|      0|    mm_a   = _mm_cvtepi16_epi32 ( _vv_loadl_epi64( (const __m128i *) ( src0 ) ) );
   92|      0|    mm_b   = _mm_cvtepi16_epi32 ( _vv_loadl_epi64( (const __m128i *) ( src1 ) ) );
   93|      0|    mm_sum = _mm_add_epi32      ( _mm_add_epi32( mm_sum, mm_a ), _mm_add_epi32( mm_b, mm_offset ) );
   94|      0|    mm_sum = _mm_packs_epi32    ( _mm_srai_epi32( mm_sum, shift ), mm_a );
   95|      0|    mm_sum = _mm_min_epi16      ( vibdimax, _mm_max_epi16( vibdimin, mm_sum ) );
   96|      0|    _vv_storel_epi64            ( (__m128i *) dst, mm_sum );
   97|      0|  }
   98|      0|}
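
For reference, the per-pixel arithmetic that addBIOAvg4_SSE vectorizes is the standard BDOF correction: the two predictions are summed and offset by a gradient-weighted term before rounding and clipping. A minimal scalar sketch under the same strides (a model of the intrinsics above, not code from vvenc):

#include <algorithm>
#include <cstddef>
#include <cstdint>

static void addBIOAvg4_ref(const int16_t* src0, const int16_t* src1,
                           int16_t* dst, ptrdiff_t dstStride,
                           const int16_t* gX0, const int16_t* gX1,
                           const int16_t* gY0, const int16_t* gY1,
                           ptrdiff_t widthG, int tmpx, int tmpy,
                           int shift, int offset, int clpMin, int clpMax)
{
  const ptrdiff_t srcStride = widthG + 2, gradStride = widthG;
  for (int y = 0; y < 4; y++)
  {
    for (int x = 0; x < 4; x++)
    {
      // gradient-weighted correction, with tmpx/tmpy from calcBIOSums
      const int b = tmpx * (gX0[x] - gX1[x]) + tmpy * (gY0[x] - gY1[x]);
      const int v = (src0[x] + src1[x] + b + offset) >> shift;
      dst[x] = (int16_t) std::min(clpMax, std::max(clpMin, v));
    }
    src0 += srcStride; src1 += srcStride; dst += dstStride;
    gX0 += gradStride; gX1 += gradStride; gY0 += gradStride; gY1 += gradStride;
  }
}
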
   99|       |
  100|       |#if USE_AVX2
  101|       |static inline void addBIOAvg4_2x_AVX2(const int16_t* src0, const int16_t* src1, int16_t* dst, ptrdiff_t dstStride, const int16_t* gradX0, const int16_t* gradX1, const int16_t* gradY0, const int16_t* gradY1, ptrdiff_t widthG, int tmpx0, int tmpx1, int tmpy0, int tmpy1, int shift, int offset, const ClpRng& clpRng)
  102|      0|{
  103|      0|  const ptrdiff_t src0Stride = widthG + 2;
  104|      0|  const ptrdiff_t src1Stride = widthG + 2;
  105|      0|  const ptrdiff_t gradStride = widthG;
  106|       |
  107|      0|  __m256i mm_tmpx    = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_set1_epi32( ( tmpx0 & 0xffff ) | ( tmpy0 * ( 1 << 16 )) ) ), _mm_set1_epi32( ( tmpx1 & 0xffff ) | ( tmpy1 * ( 1 << 16 )) ), 1 );
  108|      0|  __m256i mm_offset  = _mm256_set1_epi32( offset );
  109|      0|  __m256i vibdimin   = _mm256_set1_epi32( clpRng.min() );
  110|      0|  __m256i vibdimax   = _mm256_set1_epi32( clpRng.max() );
  111|      0|  __m256i mm_a;
  112|      0|  __m256i mm_b;
  113|      0|  __m256i mm_sum;
  114|      0|  __m128i xsrc0, xsrc1;
  115|       |
  116|      0|  for( int y = 0; y < 4; y++, dst += dstStride, src0 += src0Stride, src1 += src1Stride, gradX0 += gradStride, gradX1 += gradStride, gradY0 += gradStride, gradY1 += gradStride )
  117|      0|  {
  118|      0|    xsrc0  = _mm_loadu_si128       ( ( const __m128i * ) gradX0 );
  119|      0|    xsrc1  = _mm_loadu_si128       ( ( const __m128i * ) gradY0 );
  120|      0|    mm_a   = _mm256_castsi128_si256( _mm_unpacklo_epi16( xsrc0, xsrc1 ) );
  121|      0|    mm_a   = _mm256_inserti128_si256( mm_a, _mm_unpackhi_epi16( xsrc0, xsrc1 ), 1 );
  122|      0|    xsrc0  = _mm_loadu_si128       ( ( const __m128i * ) gradX1 );
  123|      0|    xsrc1  = _mm_loadu_si128       ( ( const __m128i * ) gradY1 );
  124|      0|    mm_b   = _mm256_castsi128_si256( _mm_unpacklo_epi16( xsrc0, xsrc1 ) );
  125|      0|    mm_b   = _mm256_inserti128_si256( mm_b, _mm_unpackhi_epi16( xsrc0, xsrc1 ), 1 );
  126|      0|    mm_a   = _mm256_sub_epi16      ( mm_a, mm_b );
  127|      0|    mm_sum = _mm256_madd_epi16     ( mm_a, mm_tmpx );
  128|      0|    mm_a   = _mm256_cvtepi16_epi32 ( _mm_loadu_si128( (const __m128i *) ( src0 ) ) );
  129|      0|    mm_b   = _mm256_cvtepi16_epi32 ( _mm_loadu_si128( (const __m128i *) ( src1 ) ) );
  130|      0|    mm_sum = _mm256_add_epi32      ( _mm256_add_epi32( mm_sum, mm_a ), _mm256_add_epi32( mm_b, mm_offset ) );
  131|      0|    mm_sum = _mm256_srai_epi32     ( mm_sum, shift );
  132|      0|    mm_sum = _mm256_min_epi32      ( vibdimax, _mm256_max_epi32( vibdimin, mm_sum ) );
  133|      0|    _mm_storeu_si128               ( (__m128i *) dst, _mm256_cvtepi32_epi16x( mm_sum ) );
  134|      0|  }
  135|      0|}
  136|       |#endif
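
Both averaging kernels lean on the same pmaddwd trick: each (tmpx, tmpy) pair is packed into one 32-bit lane, and _mm_madd_epi16 / _mm256_madd_epi16 multiplies adjacent int16 pairs and adds the products, so a single instruction yields tmpx*(gX0-gX1) + tmpy*(gY0-gY1) per pixel. The AVX2 variant additionally processes two horizontally adjacent 4x4 subblocks at once, carrying (tmpx0, tmpy0) in the low 128-bit lane and (tmpx1, tmpy1) in the high lane. A scalar model of what one such lane computes:

#include <cstdint>

// One 32-bit lane of madd_epi16: each operand holds two packed int16
// values; the lane result is their dot product.
static int32_t maddPair(int16_t a0, int16_t a1, int16_t b0, int16_t b1)
{
  return (int32_t) a0 * b0 + (int32_t) a1 * b1;
}
// With a = (gX0 - gX1, gY0 - gY1) and b = (tmpx, tmpy) this is exactly
// the correction term used in addBIOAvg4 above.
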
  137|       |
  138|       |template< X86_VEXT vext >
  139|       |static inline void calcBIOSums_SSE(const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, const int widthG, const int bitDepth, int limit, int &tmpx, int &tmpy)
  140|      0|{
  141|      0|  static constexpr int shift4 = 4;
  142|      0|  static constexpr int shift5 = 1;
  143|      0|  const int srcStride = widthG + 2;
  144|       |
  145|      0|  __m128i sumAbsGXTmp    = _mm_setzero_si128();
  146|      0|  __m128i sumDIXTmp      = _mm_setzero_si128();
  147|      0|  __m128i sumAbsGYTmp    = _mm_setzero_si128();
  148|      0|  __m128i sumDIYTmp      = _mm_setzero_si128();
  149|      0|  __m128i sumSignGyGxTmp = _mm_setzero_si128();
  150|       |
  151|      0|  for (int y = 0; y < 6; y++)
  152|      0|  {
  153|      0|    __m128i shiftSrcY0Tmp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY0Tmp)), shift4);
  154|      0|    __m128i shiftSrcY1Tmp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY1Tmp)), shift4);
  155|      0|    __m128i loadGradX0    = _mm_loadu_si128((__m128i*)(gradX0));
  156|      0|    __m128i loadGradX1    = _mm_loadu_si128((__m128i*)(gradX1));
  157|      0|    __m128i loadGradY0    = _mm_loadu_si128((__m128i*)(gradY0));
  158|      0|    __m128i loadGradY1    = _mm_loadu_si128((__m128i*)(gradY1));
  159|      0|    __m128i subTemp1      = _mm_sub_epi16(shiftSrcY1Tmp, shiftSrcY0Tmp);
  160|      0|    __m128i packTempX     = _mm_srai_epi16(_mm_add_epi16(loadGradX0, loadGradX1), shift5);
  161|      0|    __m128i packTempY     = _mm_srai_epi16(_mm_add_epi16(loadGradY0, loadGradY1), shift5);
  162|      0|    __m128i gX            = _mm_abs_epi16(packTempX);
  163|      0|    __m128i gY            = _mm_abs_epi16(packTempY);
  164|      0|    __m128i dIX           = _mm_sign_epi16(subTemp1,  packTempX);
  165|      0|    __m128i dIY           = _mm_sign_epi16(subTemp1,  packTempY);
  166|      0|    __m128i signGY_GX     = _mm_sign_epi16(packTempX, packTempY);
  167|       |
  168|      0|    sumAbsGXTmp     = _mm_add_epi16(sumAbsGXTmp, gX);
  169|      0|    sumDIXTmp       = _mm_add_epi16(sumDIXTmp, dIX);
  170|      0|    sumAbsGYTmp     = _mm_add_epi16(sumAbsGYTmp, gY);
  171|      0|    sumDIYTmp       = _mm_add_epi16(sumDIYTmp, dIY);
  172|      0|    sumSignGyGxTmp  = _mm_add_epi16(sumSignGyGxTmp, signGY_GX);
  173|       |
  174|      0|    srcY0Tmp += srcStride;
  175|      0|    srcY1Tmp += srcStride;
  176|      0|    gradX0 += widthG;
  177|      0|    gradX1 += widthG;
  178|      0|    gradY0 += widthG;
  179|      0|    gradY1 += widthG;
  180|      0|  }
  181|       |
  182|      0|  sumAbsGXTmp    = _mm_madd_epi16(sumAbsGXTmp,    _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
  183|      0|  sumDIXTmp      = _mm_madd_epi16(sumDIXTmp,      _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
  184|      0|  sumAbsGYTmp    = _mm_madd_epi16(sumAbsGYTmp,    _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
  185|      0|  sumDIYTmp      = _mm_madd_epi16(sumDIYTmp,      _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
  186|      0|  sumSignGyGxTmp = _mm_madd_epi16(sumSignGyGxTmp, _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
  187|       |
  188|      0|  __m128i a12 = _mm_unpacklo_epi32(sumAbsGXTmp, sumAbsGYTmp);
  189|      0|  __m128i a3  = _mm_unpackhi_epi32(sumAbsGXTmp, sumAbsGYTmp);
  190|      0|  __m128i b12 = _mm_unpacklo_epi32(sumDIXTmp, sumDIYTmp);
  191|      0|  __m128i b3  = _mm_unpackhi_epi32(sumDIXTmp, sumDIYTmp);
  192|      0|  __m128i c1  = _mm_unpacklo_epi64(a12, b12);
  193|      0|  __m128i c2  = _mm_unpackhi_epi64(a12, b12);
  194|      0|  __m128i c3  = _mm_unpacklo_epi64(a3, b3);
  195|       |
  196|      0|  c1 = _mm_add_epi32(c1, c2);
  197|      0|  c1 = _mm_add_epi32(c1, c3);
  198|       |
  199|      0|  int sumAbsGX = _mm_cvtsi128_si32(c1);
  200|      0|  int sumAbsGY = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0x55));
  201|      0|  int sumDIX   = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0xaa));
  202|      0|  int sumDIY   = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0xff));
  203|       |
  204|      0|  sumSignGyGxTmp = _mm_add_epi32(sumSignGyGxTmp, _mm_shuffle_epi32(sumSignGyGxTmp, 0x4e));   // 01001110
  205|      0|  sumSignGyGxTmp = _mm_add_epi32(sumSignGyGxTmp, _mm_shuffle_epi32(sumSignGyGxTmp, 0xb1));   // 10110001
  206|      0|  int sumSignGY_GX  = _mm_cvtsi128_si32(sumSignGyGxTmp);
  207|       |
  208|      0|  tmpx = sumAbsGX == 0 ? 0 : rightShiftMSB( sumDIX << 2, sumAbsGX );
  209|      0|  tmpx = Clip3( -limit, limit, tmpx );
  210|       |
  211|      0|  int mainsGxGy = sumSignGY_GX >> 12;
  212|      0|  int secsGxGy  = sumSignGY_GX & ( ( 1 << 12 ) - 1 );
  213|      0|  int tmpData   = tmpx * mainsGxGy;
  214|      0|  tmpData       = ( ( tmpData << 12 ) + tmpx * secsGxGy ) >> 1;
  215|      0|  tmpy = sumAbsGY == 0 ? 0 : rightShiftMSB( ( ( sumDIY << 2 ) - tmpData ), sumAbsGY );
  216|      0|  tmpy = Clip3( -limit, limit, tmpy );
  217|      0|}
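
In scalar terms, calcBIOSums_SSE accumulates the five BDOF window sums over a 6x6 neighbourhood around each 4x4 subblock: the loads cover 8 lanes, and the (1,1,1,1,1,1,0,0) pmaddwd mask at the horizontal reduction keeps only the 6 valid columns. A minimal scalar sketch, with sgn(x) in {-1, 0, +1} matching _mm_sign_epi16 semantics (a model of the intrinsics, not vvenc code):

#include <cstdint>
#include <cstdlib>

static inline int sgn(int x) { return (x > 0) - (x < 0); }

static void calcBIOSums_ref(const int16_t* s0, const int16_t* s1,
                            const int16_t* gX0, const int16_t* gX1,
                            const int16_t* gY0, const int16_t* gY1,
                            int widthG,
                            int& sumAbsGX, int& sumAbsGY,
                            int& sumDIX, int& sumDIY, int& sumSignGyGx)
{
  const int srcStride = widthG + 2;
  sumAbsGX = sumAbsGY = sumDIX = sumDIY = sumSignGyGx = 0;
  for (int y = 0; y < 6; y++)
  {
    for (int x = 0; x < 6; x++)
    {
      const int dI = (s1[x] >> 4) - (s0[x] >> 4);   // shift4
      const int gX = (gX0[x] + gX1[x]) >> 1;        // shift5
      const int gY = (gY0[x] + gY1[x]) >> 1;
      sumAbsGX    += std::abs(gX);
      sumAbsGY    += std::abs(gY);
      sumDIX      += sgn(gX) * dI;
      sumDIY      += sgn(gY) * dI;
      sumSignGyGx += sgn(gY) * gX;
    }
    s0 += srcStride; s1 += srcStride;
    gX0 += widthG; gX1 += widthG; gY0 += widthG; gY1 += widthG;
  }
}

tmpx and tmpy then follow as in source lines 208-216: tmpx approximates (sumDIX << 2) / sumAbsGX, and tmpy subtracts the cross term (tmpx * sumSignGY_GX) >> 1 from sumDIY << 2 before the division by sumAbsGY.
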
  218|       |
  219|       |#if USE_AVX2
  220|       |static inline void calcBIOSums2x_AVX2(const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, const int widthG, const int bitDepth, int limit, int &tmpx0, int &tmpx1, int &tmpy0, int &tmpy1 )
  221|      0|{
  222|      0|  static constexpr int shift4 = 4;
  223|      0|  static constexpr int shift5 = 1;
  224|      0|  const int srcStride = widthG + 2;
  225|       |
  226|      0|  __m256i sumAbsGXTmp     = _mm256_setzero_si256();
  227|      0|  __m256i sumDIXTmp       = _mm256_setzero_si256();
  228|      0|  __m256i sumAbsGYTmp     = _mm256_setzero_si256();
  229|      0|  __m256i sumDIYTmp       = _mm256_setzero_si256();
  230|      0|  __m256i sumSignGyGxTmp  = _mm256_setzero_si256();
  231|       |
  232|      0|#define _mm256_load2_si128_offset4(addr) _mm256_inserti128_si256( _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*) &addr[0])), _mm_loadu_si128((const __m128i*) &addr[4]), 1 )
  233|       |
  234|      0|  for (int y = 0; y < 6; y++)
  235|      0|  {
  236|      0|    __m256i shiftSrcY0Tmp = _mm256_srai_epi16(_mm256_load2_si128_offset4(srcY0Tmp), shift4);
  237|      0|    __m256i shiftSrcY1Tmp = _mm256_srai_epi16(_mm256_load2_si128_offset4(srcY1Tmp), shift4);
  238|      0|    __m256i loadGradX0    = _mm256_load2_si128_offset4(gradX0);
  239|      0|    __m256i loadGradX1    = _mm256_load2_si128_offset4(gradX1);
  240|      0|    __m256i loadGradY0    = _mm256_load2_si128_offset4(gradY0);
  241|      0|    __m256i loadGradY1    = _mm256_load2_si128_offset4(gradY1);
  242|      0|    __m256i subTemp1      = _mm256_sub_epi16(shiftSrcY1Tmp, shiftSrcY0Tmp);
  243|      0|    __m256i packTempX     = _mm256_srai_epi16(_mm256_add_epi16(loadGradX0, loadGradX1), shift5);
  244|      0|    __m256i packTempY     = _mm256_srai_epi16(_mm256_add_epi16(loadGradY0, loadGradY1), shift5);
  245|      0|    __m256i gX            = _mm256_abs_epi16(packTempX);
  246|      0|    __m256i gY            = _mm256_abs_epi16(packTempY);
  247|      0|    __m256i dIX           = _mm256_sign_epi16(subTemp1,  packTempX );
  248|      0|    __m256i dIY           = _mm256_sign_epi16(subTemp1,  packTempY );
  249|      0|    __m256i signGY_GX     = _mm256_sign_epi16(packTempX, packTempY );
  250|       |
  251|      0|    sumAbsGXTmp     = _mm256_add_epi16(sumAbsGXTmp, gX);
  252|      0|    sumDIXTmp       = _mm256_add_epi16(sumDIXTmp, dIX);
  253|      0|    sumAbsGYTmp     = _mm256_add_epi16(sumAbsGYTmp, gY);
  254|      0|    sumDIYTmp       = _mm256_add_epi16(sumDIYTmp, dIY);
  255|      0|    sumSignGyGxTmp  = _mm256_add_epi16(sumSignGyGxTmp, signGY_GX);
  256|       |
  257|      0|    srcY0Tmp += srcStride;
  258|      0|    srcY1Tmp += srcStride;
  259|      0|    gradX0 += widthG;
  260|      0|    gradX1 += widthG;
  261|      0|    gradY0 += widthG;
  262|      0|    gradY1 += widthG;
  263|      0|  }
  264|       |
  265|      0|#undef _mm256_load2_si128_offset4
  266|       |
  267|      0|  sumAbsGXTmp    = _mm256_madd_epi16(sumAbsGXTmp,    _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
  268|      0|  sumDIXTmp      = _mm256_madd_epi16(sumDIXTmp,      _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
  269|      0|  sumAbsGYTmp    = _mm256_madd_epi16(sumAbsGYTmp,    _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
  270|      0|  sumDIYTmp      = _mm256_madd_epi16(sumDIYTmp,      _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
  271|      0|  sumSignGyGxTmp = _mm256_madd_epi16(sumSignGyGxTmp, _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
  272|       |
  273|      0|  __m256i a12 = _mm256_unpacklo_epi32(sumAbsGXTmp, sumAbsGYTmp);
  274|      0|  __m256i a3  = _mm256_unpackhi_epi32(sumAbsGXTmp, sumAbsGYTmp);
  275|      0|  __m256i b12 = _mm256_unpacklo_epi32(sumDIXTmp, sumDIYTmp);
  276|      0|  __m256i b3  = _mm256_unpackhi_epi32(sumDIXTmp, sumDIYTmp);
  277|      0|  __m256i c1  = _mm256_unpacklo_epi64(a12, b12);
  278|      0|  __m256i c2  = _mm256_unpackhi_epi64(a12, b12);
  279|      0|  __m256i c3  = _mm256_unpacklo_epi64(a3, b3);
  280|       |
  281|      0|  c1 = _mm256_add_epi32(c1, c2);
  282|      0|  c1 = _mm256_add_epi32(c1, c3);
  283|       |
  284|      0|  int tmpData[8];
  285|       |
  286|      0|  _mm256_storeu_si256( ( __m256i* ) &tmpData[0], c1 );
  287|       |
  288|      0|  #define sumAbsGX0 tmpData[0]
  289|      0|  #define sumAbsGX1 tmpData[4]
  290|       |
  291|      0|  #define sumAbsGY0 tmpData[1]
  292|      0|  #define sumAbsGY1 tmpData[5]
  293|       |
  294|      0|  #define sumDIX0   tmpData[2]
  295|      0|  #define sumDIX1   tmpData[6]
  296|       |
  297|      0|  #define sumDIY0   tmpData[3]
  298|      0|  #define sumDIY1   tmpData[7]
  299|       |
  300|      0|  sumSignGyGxTmp = _mm256_add_epi32(sumSignGyGxTmp, _mm256_shuffle_epi32(sumSignGyGxTmp, 0x4e));   // 01001110
  301|      0|  sumSignGyGxTmp = _mm256_add_epi32(sumSignGyGxTmp, _mm256_shuffle_epi32(sumSignGyGxTmp, 0xb1));   // 10110001
  302|       |
  303|      0|  int sumSignGY_GX0 = _mm256_extract_epi32( sumSignGyGxTmp, 0 );
  304|      0|  int sumSignGY_GX1 = _mm256_extract_epi32( sumSignGyGxTmp, 4 );
  305|       |
  306|       |#if 0
  307|       |  tmpx0 = sumAbsGX0 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( sumDIX0 << 2, sumAbsGX0 ) );
  308|       |  tmpx1 = sumAbsGX1 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( sumDIX1 << 2, sumAbsGX1 ) );
  309|       |  __m128i vtmpx         = _mm_setr_epi32 ( tmpx0, tmpx1, 0, 0 );
  310|       |  __m128i vsumSignGY_GX = _mm_setr_epi32 ( sumSignGY_GX0, sumSignGY_GX1, 0, 0 );
  311|       |  __m128i vmainsGxGy    = _mm_srai_epi32 ( vsumSignGY_GX, 12 );
  312|       |  __m128i vsecsGxGy     = _mm_and_si128  ( vsumSignGY_GX, _mm_set1_epi32( ( 1 << 12 ) - 1 ) );
  313|       |  __m128i vtmpData      = _mm_mullo_epi32( vtmpx, vmainsGxGy );
  314|       |  vtmpData              = _mm_slli_epi32 ( vtmpData, 12 );
  315|       |  vtmpData              = _mm_add_epi32  ( vtmpData, _mm_mullo_epi32( vtmpx, vsecsGxGy ) );
  316|       |  vtmpData              = _mm_srai_epi32 ( vtmpData, 1 );
  317|       |  __m128i vtmpyIn       = _mm_slli_epi32 ( _mm_setr_epi32( sumDIY0, sumDIY1, 0, 0 ), 2 );
  318|       |  vtmpyIn               = _mm_sub_epi32  ( vtmpyIn, vtmpData );
  319|       |
  320|       |  tmpy0 = sumAbsGY0 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( _mm_extract_epi32( vtmpyIn, 0 ), sumAbsGY0 ) );
  321|       |  tmpy1 = sumAbsGY1 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( _mm_extract_epi32( vtmpyIn, 1 ), sumAbsGY1 ) );
  322|       |#else
  323|      0|  tmpx0 = sumAbsGX0 == 0 ? 0 : rightShiftMSB( sumDIX0 * 4, sumAbsGX0 );
  324|      0|  tmpx0 = Clip3( -limit, limit, tmpx0 );
  325|       |
  326|      0|  int mainsGxGy0 = sumSignGY_GX0 >> 12;
  327|      0|  int secsGxGy0  = sumSignGY_GX0 & ( ( 1 << 12 ) - 1 );
  328|      0|  int tmpData0   = tmpx0 * mainsGxGy0;
  329|      0|  tmpData0       = ( ( tmpData0 * ( 1 << 12 )) + tmpx0 * secsGxGy0 ) >> 1;
  330|      0|  tmpy0 = sumAbsGY0 == 0 ? 0 : rightShiftMSB( ( ( sumDIY0 * 4 ) - tmpData0 ), sumAbsGY0 );
  331|      0|  tmpy0 = Clip3( -limit, limit, tmpy0 );
  332|       |
  333|       |
  334|      0|  tmpx1 = sumAbsGX1 == 0 ? 0 : rightShiftMSB( sumDIX1 * 4, sumAbsGX1 );
  335|      0|  tmpx1 = Clip3( -limit, limit, tmpx1 );
  336|       |
  337|      0|  int mainsGxGy1 = sumSignGY_GX1 >> 12;
  338|      0|  int secsGxGy1  = sumSignGY_GX1 & ( ( 1 << 12 ) - 1 );
  339|      0|  int tmpData1   = tmpx1 * mainsGxGy1;
  340|      0|  tmpData1       = ( ( tmpData1 * ( 1 << 12 )) + tmpx1 * secsGxGy1 ) >> 1;
  341|      0|  tmpy1 = sumAbsGY1 == 0 ? 0 : rightShiftMSB( ( ( sumDIY1 * 4 ) - tmpData1 ), sumAbsGY1 );
  342|      0|  tmpy1 = Clip3( -limit, limit, tmpy1 );
  343|      0|#endif
  344|       |
  345|      0|#undef sumAbsGX0
  346|      0|#undef sumAbsGX1
  347|      0|#undef sumAbsGY0
  348|      0|#undef sumAbsGY1
  349|      0|#undef sumDIX0
  350|      0|#undef sumDIX1
  351|      0|#undef sumDIY0
  352|      0|#undef sumDIY1
  353|      0|}
  354|       |#endif
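
The mainsGxGy/secsGxGy split used in both calcBIOSums variants is just a way of evaluating (tmpx * sumSignGY_GX) >> 1 in small pieces: since s == (s >> 12) * 4096 + (s & 4095) holds under arithmetic right shift, splitting s keeps each partial product comfortably inside 32 bits. A worked example (editor's illustration, 32-bit int assumed):

#include <cassert>

static void splitMulExample()
{
  const int s = -5000, tmpx = 3;
  const int mains = s >> 12;              // -2: arithmetic shift rounds toward -infinity
  const int secs  = s & ((1 << 12) - 1);  // 3192, and (-2) * 4096 + 3192 == -5000
  const int t     = tmpx * mains * (1 << 12) + tmpx * secs;
  assert(t == tmpx * s);                  // -15000; (t >> 1) is the term subtracted from sumDIY << 2
}
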
  355|       |
  356|       |template< X86_VEXT vext>
  357|       |void BiOptFlowCoreSIMD( const Pel* srcY0,
  358|       |                        const Pel* srcY1,
  359|       |                        const Pel* gradX0,
  360|       |                        const Pel* gradX1,
  361|       |                        const Pel* gradY0,
  362|       |                        const Pel* gradY1,
  363|       |                        const int  width,
  364|       |                        const int  height,
  365|       |                              Pel* dstY,
  366|       |                        const ptrdiff_t dstStride,
  367|       |                        const int  shiftNum,
  368|       |                        const int  offset,
  369|       |                        const int  limit,
  370|       |                        const ClpRng& clpRng,
  371|       |                        const int bitDepth )
  372|      0|{
  373|      0|  const int widthG        = width  + 2 * BDOF_EXTEND_SIZE;
  374|      0|  const int stridePredMC  = widthG + 2;
  375|      0|        int offsetPos     = widthG * BDOF_EXTEND_SIZE + BDOF_EXTEND_SIZE;
  376|      0|  const int xUnit         = ( width  >> 2 );
  377|      0|  const int yUnit         = ( height >> 2 );
  378|       |
  379|      0|  const Pel* srcY0Temp;
  380|      0|  const Pel* srcY1Temp;
  381|      0|        Pel *dstY0;
  382|       |
  383|      0|  int OffPos;
  384|      0|  int OffPad = 0;
  385|       |
  386|      0|  for( int yu = 0; yu < yUnit; yu++, srcY0 += ( stridePredMC << 2 ), srcY1 += ( stridePredMC << 2 ), dstY += ( dstStride << 2 ), offsetPos += ( widthG << 2 ) )
  387|      0|  {
  388|      0|    srcY0Temp = srcY0;
  389|      0|    srcY1Temp = srcY1;
  390|      0|    dstY0     = dstY;
  391|       |
  392|      0|    OffPos = offsetPos;
  393|      0|    OffPad = ( ( yu * widthG ) << 2 );
  394|       |
  395|       |#if USE_AVX2
  396|      0|    for( int xu = 0; xu < xUnit; xu += 2, srcY0Temp += 8, srcY1Temp += 8, dstY0 += 8, OffPos += 8, OffPad += 8 )
  397|      0|    {
  398|      0|      int tmpx0, tmpy0, tmpx1, tmpy1;
  399|       |
  400|       |      //calcBIOSums_SSE<vext>( srcY0Temp + 0, srcY1Temp + 0, gradX0 + OffPad + 0, gradX1 + OffPad + 0, gradY0 + OffPad + 0, gradY1 + OffPad + 0, stridePredMC, bitDepth, limit, tmpx, tmpy );
  401|       |      //calcBIOSums_SSE<vext>( srcY0Temp + 4, srcY1Temp + 4, gradX0 + OffPad + 4, gradX1 + OffPad + 4, gradY0 + OffPad + 4, gradY1 + OffPad + 4, stridePredMC, bitDepth, limit, tmpx, tmpy );
  402|       |      calcBIOSums2x_AVX2( srcY0Temp, srcY1Temp, gradX0 + OffPad, gradX1 + OffPad, gradY0 + OffPad, gradY1 + OffPad, widthG, bitDepth, limit, tmpx0, tmpx1, tmpy0, tmpy1 );
  403|       |
  404|       |      //addBIOAvg4_SSE<vext>( srcY0Temp + stridePredMC + 1 + 0, srcY1Temp + stridePredMC + 1 + 0, dstY0 + 0, dstStride, gradX0 + OffPos + 0, gradX1 + OffPos + 0, gradY0 + OffPos + 0, gradY1 + OffPos + 0, widthG, tmpx0, tmpy0, shiftNum, offset, clpRng );
  405|       |      //addBIOAvg4_SSE<vext>( srcY0Temp + stridePredMC + 1 + 4, srcY1Temp + stridePredMC + 1 + 4, dstY0 + 4, dstStride, gradX0 + OffPos + 4, gradX1 + OffPos + 4, gradY0 + OffPos + 4, gradY1 + OffPos + 4, widthG, tmpx1, tmpy1, shiftNum, offset, clpRng );
  406|      0|      addBIOAvg4_2x_AVX2( srcY0Temp + stridePredMC + 1, srcY1Temp + stridePredMC + 1, dstY0, dstStride, gradX0 + OffPos, gradX1 + OffPos, gradY0 + OffPos, gradY1 + OffPos, widthG, tmpx0, tmpx1, tmpy0, tmpy1, shiftNum, offset, clpRng );
  407|      0|    }  // xu
  408|       |#else
  409|      0|    for( int xu = 0; xu < xUnit; xu++, srcY0Temp += 4, srcY1Temp += 4, dstY0 += 4, OffPos += 4, OffPad += 4 )
  410|      0|    {
  411|      0|      int tmpx, tmpy;
  412|       |
  413|       |      calcBIOSums_SSE<vext>( srcY0Temp, srcY1Temp, gradX0 + OffPad, gradX1 + OffPad, gradY0 + OffPad, gradY1 + OffPad, widthG, bitDepth, limit, tmpx, tmpy );
  414|       |
  415|      0|      addBIOAvg4_SSE<vext> ( srcY0Temp + stridePredMC + 1, srcY1Temp + stridePredMC + 1, dstY0, dstStride, gradX0 + OffPos, gradX1 + OffPos, gradY0 + OffPos, gradY1 + OffPos, widthG, tmpx, tmpy, shiftNum, offset, clpRng );
  416|      0|    }  // xu
  417|       |#endif
  418|      0|  }  // yu
  419|       |#if USE_AVX2
  420|       |
  421|       |  _mm256_zeroupper();
  422|       |#endif
  423|      0|}
Unexecuted instantiation: void vvenc::BiOptFlowCoreSIMD<(vvenc::x86_simd::X86_VEXT)1>(short const*, short const*, short const*, short const*, short const*, short const*, int, int, short*, long, int, int, int, vvenc::ClpRng const&, int)
Unexecuted instantiation: void vvenc::BiOptFlowCoreSIMD<(vvenc::x86_simd::X86_VEXT)4>(short const*, short const*, short const*, short const*, short const*, short const*, int, int, short*, long, int, int, int, vvenc::ClpRng const&, int)
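
BiOptFlowCoreSIMD walks the CU in 4x4 subblocks (two per iteration under AVX2). The stride bookkeeping follows from the BDOF border; a sketch of the derived sizes, assuming BDOF_EXTEND_SIZE == 1 as in VVC's BDOF (the constant itself lives elsewhere in vvenc):

#include <cassert>

static void bdofGeometryExample()
{
  // Hypothetical numbers for a 16x16 CU; 'extend' stands in for BDOF_EXTEND_SIZE.
  const int width = 16, height = 16, extend = 1;
  const int widthG       = width + 2 * extend;       // 18: gradient rows carry a 1-pel border
  const int stridePredMC = widthG + 2;               // 20: prediction rows carry one more pel per side
  const int offsetPos    = widthG * extend + extend; // 19: offset of the first interior gradient sample
  const int xUnit        = width  >> 2;              // 4 subblock columns...
  const int yUnit        = height >> 2;              // ...and 4 subblock rows of 4x4 pixels
  assert(widthG == 18 && stridePredMC == 20 && offsetPos == 19 && xUnit == 4 && yUnit == 4);
}
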
  424|       |
  425|       |template< X86_VEXT vext, bool PAD = true>
  426|       |void gradFilter_SSE(const Pel* src, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY, const int bitDepth)
  427|      0|{
  428|      0|  const Pel* srcTmp = src + srcStride + 1;
  429|      0|  Pel* gradXTmp = gradX + gradStride + 1;
  430|      0|  Pel* gradYTmp = gradY + gradStride + 1;
  431|       |
  432|      0|  int widthInside = width - 2 * BDOF_EXTEND_SIZE;
  433|      0|  int heightInside = height - 2 * BDOF_EXTEND_SIZE;
  434|      0|  int shift1 = 6;
  435|      0|  __m128i mmShift1 = _mm_cvtsi32_si128(shift1);
  436|      0|  assert((widthInside & 3) == 0);
  437|       |
  438|      0|  if ((widthInside & 7) == 0)
  439|      0|  {
  440|      0|    for (int y = 0; y < heightInside; y++)
  441|      0|    {
  442|      0|      int x = 0;
  443|      0|      for (; x < widthInside; x += 8)
  444|      0|      {
  445|      0|        __m128i mmPixTop = _mm_sra_epi16(_mm_loadu_si128((__m128i*) (srcTmp + x - srcStride)), mmShift1);
  446|      0|        __m128i mmPixBottom = _mm_sra_epi16(_mm_loadu_si128((__m128i*) (srcTmp + x + srcStride)), mmShift1);
  447|      0|        __m128i mmPixLeft = _mm_sra_epi16(_mm_loadu_si128((__m128i*) (srcTmp + x - 1)), mmShift1);
  448|      0|        __m128i mmPixRight = _mm_sra_epi16(_mm_loadu_si128((__m128i*) (srcTmp + x + 1)), mmShift1);
  449|       |
  450|      0|        __m128i mmGradVer = _mm_sub_epi16(mmPixBottom, mmPixTop);
  451|      0|        __m128i mmGradHor = _mm_sub_epi16(mmPixRight, mmPixLeft);
  452|       |
  453|      0|        _mm_storeu_si128((__m128i*) (gradYTmp + x), mmGradVer);
  454|      0|        _mm_storeu_si128((__m128i*) (gradXTmp + x), mmGradHor);
  455|      0|      }
  456|       |
  457|      0|      gradXTmp += gradStride;
  458|      0|      gradYTmp += gradStride;
  459|      0|      srcTmp += srcStride;
  460|      0|    }
  461|      0|  }
  462|      0|  else
  463|      0|  {
  464|      0|    __m128i mmPixTop = _mm_sra_epi16(_mm_unpacklo_epi64(_vv_loadl_epi64((__m128i*) (srcTmp - srcStride)), _vv_loadl_epi64((__m128i*) (srcTmp))), mmShift1);
  465|      0|    for (int y = 0; y < heightInside; y += 2)
  466|      0|    {
  467|      0|      __m128i mmPixBottom = _mm_sra_epi16(_mm_unpacklo_epi64(_vv_loadl_epi64((__m128i*) (srcTmp + srcStride)), _vv_loadl_epi64((__m128i*) (srcTmp + (srcStride << 1)))), mmShift1);
  468|      0|      __m128i mmPixLeft = _mm_sra_epi16(_mm_unpacklo_epi64(_vv_loadl_epi64((__m128i*) (srcTmp - 1)), _vv_loadl_epi64((__m128i*) (srcTmp - 1 + srcStride))), mmShift1);
  469|      0|      __m128i mmPixRight = _mm_sra_epi16(_mm_unpacklo_epi64(_vv_loadl_epi64((__m128i*) (srcTmp + 1)), _vv_loadl_epi64((__m128i*) (srcTmp + 1 + srcStride))), mmShift1);
  470|       |
  471|      0|      __m128i mmGradVer = _mm_sub_epi16(mmPixBottom, mmPixTop);
  472|      0|      __m128i mmGradHor = _mm_sub_epi16(mmPixRight, mmPixLeft);
  473|       |
  474|      0|      _vv_storel_epi64((__m128i*) gradYTmp, mmGradVer);
  475|      0|      _vv_storel_epi64((__m128i*) (gradYTmp + gradStride), _mm_unpackhi_epi64(mmGradVer, mmGradHor));
  476|      0|      _vv_storel_epi64((__m128i*) gradXTmp, mmGradHor);
  477|      0|      _vv_storel_epi64((__m128i*) (gradXTmp + gradStride), _mm_unpackhi_epi64(mmGradHor, mmGradVer));
  478|       |
  479|      0|      mmPixTop = mmPixBottom;
  480|      0|      gradXTmp += gradStride << 1;
  481|      0|      gradYTmp += gradStride << 1;
  482|      0|      srcTmp += srcStride << 1;
  483|      0|    }
  484|      0|  }
  485|       |
  486|      0|  if (PAD)
  487|      0|  {
  488|      0|    gradXTmp = gradX + gradStride + 1;
  489|      0|    gradYTmp = gradY + gradStride + 1;
  490|      0|    for (int y = 0; y < heightInside; y++)
  491|      0|    {
  492|      0|      gradXTmp[-1] = gradXTmp[0];
  493|      0|      gradXTmp[widthInside] = gradXTmp[widthInside - 1];
  494|      0|      gradXTmp += gradStride;
  495|       |
  496|      0|      gradYTmp[-1] = gradYTmp[0];
  497|      0|      gradYTmp[widthInside] = gradYTmp[widthInside - 1];
  498|      0|      gradYTmp += gradStride;
  499|      0|    }
  500|       |
  501|      0|    gradXTmp = gradX + gradStride;
  502|      0|    gradYTmp = gradY + gradStride;
  503|      0|    ::memcpy(gradXTmp - gradStride, gradXTmp, sizeof(Pel) * (width));
  504|      0|    ::memcpy(gradXTmp + heightInside * gradStride, gradXTmp + (heightInside - 1) * gradStride, sizeof(Pel) * (width));
  505|      0|    ::memcpy(gradYTmp - gradStride, gradYTmp, sizeof(Pel) * (width));
  506|      0|    ::memcpy(gradYTmp + heightInside * gradStride, gradYTmp + (heightInside - 1) * gradStride, sizeof(Pel) * (width));
  507|      0|  }
  508|      0|}
Unexecuted instantiation: void vvenc::gradFilter_SSE<(vvenc::x86_simd::X86_VEXT)1, true>(short const*, int, int, int, int, short*, short*, int)
Unexecuted instantiation: void vvenc::gradFilter_SSE<(vvenc::x86_simd::X86_VEXT)1, false>(short const*, int, int, int, int, short*, short*, int)
Unexecuted instantiation: void vvenc::gradFilter_SSE<(vvenc::x86_simd::X86_VEXT)4, true>(short const*, int, int, int, int, short*, short*, int)
Unexecuted instantiation: void vvenc::gradFilter_SSE<(vvenc::x86_simd::X86_VEXT)4, false>(short const*, int, int, int, int, short*, short*, int)
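
The gradients themselves are plain central differences on samples pre-scaled by shift1 == 6, computed over the inner area; the PAD branch then replicates them into the one-pel border. A scalar model of the inner loop (editor's sketch, not vvenc code; Pel_t stands in for vvenc's Pel):

#include <cstdint>
typedef int16_t Pel_t;

static void gradFilterInner_ref(const Pel_t* src, int srcStride,
                                int widthInside, int heightInside,
                                int gradStride, Pel_t* gradX, Pel_t* gradY)
{
  const Pel_t* srcTmp = src + srcStride + 1;   // skip the 1-pel border
  Pel_t* gX = gradX + gradStride + 1;
  Pel_t* gY = gradY + gradStride + 1;
  for (int y = 0; y < heightInside; y++)
  {
    for (int x = 0; x < widthInside; x++)
    {
      gX[x] = (Pel_t)((srcTmp[x + 1] >> 6) - (srcTmp[x - 1] >> 6));                   // horizontal
      gY[x] = (Pel_t)((srcTmp[x + srcStride] >> 6) - (srcTmp[x - srcStride] >> 6));   // vertical
    }
    gX += gradStride; gY += gradStride; srcTmp += srcStride;
  }
}
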
  509|       |
  510|       |template< X86_VEXT vext >
  511|       |void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, const bool& bi, int shiftNum, Pel offset, const ClpRng& clpRng)
  512|      0|{
  513|      0|  CHECKD( width != 4 || height != 4, "block width error!");
  514|       |
  515|      0|  const int dILimit = 1 << std::max<int>(clpRng.bd + 1, 13);
  516|       |
  517|       |#if USE_AVX2
  518|       |  __m256i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0, mm_src;
  519|       |  __m256i mm_offset = _mm256_set1_epi16( offset );
  520|       |  __m256i vibdimin  = _mm256_set1_epi16( clpRng.min() );
  521|       |  __m256i vibdimax  = _mm256_set1_epi16( clpRng.max() );
  522|       |  __m256i mm_dimin  = _mm256_set1_epi32( -dILimit );
  523|       |  __m256i mm_dimax  = _mm256_set1_epi32( dILimit - 1 );
  524|       |
  525|       |  const int *vX0 = dMvX, *vY0 = dMvY;
  526|       |  const Pel *gX0 = gradX, *gY0 = gradY;
  527|       |
  528|       |  // first two rows
  529|       |  mm_dmvx = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadu_si128( ( const __m128i * ) vX0 ) ), _mm_loadu_si128( ( const __m128i * )( vX0 + dMvStride ) ), 1 );
  530|       |  mm_dmvy = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadu_si128( ( const __m128i * ) vY0 ) ), _mm_loadu_si128( ( const __m128i * )( vY0 + dMvStride ) ), 1 );
  531|       |
  532|       |  mm_dmvx = _mm256_packs_epi32( mm_dmvx, _mm256_setzero_si256() );
  533|       |  mm_dmvy = _mm256_packs_epi32( mm_dmvy, _mm256_setzero_si256() );
  534|       |
  535|       |  mm_gradx = _mm256_inserti128_si256( _mm256_castsi128_si256( _vv_loadl_epi64( ( __m128i* )gX0 ) ), _vv_loadl_epi64( ( __m128i* )( gX0 + gradStride ) ), 1 );
  536|       |  mm_grady = _mm256_inserti128_si256( _mm256_castsi128_si256( _vv_loadl_epi64( ( __m128i* )gY0 ) ), _vv_loadl_epi64( ( __m128i* )( gY0 + gradStride ) ), 1 );
  537|       |
  538|       |  mm_dI0   = _mm256_madd_epi16( _mm256_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm256_unpacklo_epi16( mm_gradx, mm_grady ) );
  539|       |  mm_dI0   = _mm256_min_epi32( mm_dimax, _mm256_max_epi32( mm_dimin, mm_dI0 ) );
  540|       |
  541|       |  // next two rows
  542|       |  vX0 += ( dMvStride << 1 ); vY0 += ( dMvStride << 1 ); gX0 += ( gradStride << 1 ); gY0 += ( gradStride << 1 );
  543|       |
  544|       |  mm_dmvx = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadu_si128( ( const __m128i * ) vX0 ) ), _mm_loadu_si128( ( const __m128i * )( vX0 + dMvStride ) ), 1 );
  545|       |  mm_dmvy = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadu_si128( ( const __m128i * ) vY0 ) ), _mm_loadu_si128( ( const __m128i * )( vY0 + dMvStride ) ), 1 );
  546|       |
  547|       |  mm_dmvx = _mm256_packs_epi32( mm_dmvx, _mm256_setzero_si256() );
  548|       |  mm_dmvy = _mm256_packs_epi32( mm_dmvy, _mm256_setzero_si256() );
  549|       |
  550|       |  mm_gradx = _mm256_inserti128_si256( _mm256_castsi128_si256( _vv_loadl_epi64( ( __m128i* )gX0 ) ), _vv_loadl_epi64( ( __m128i* )( gX0 + gradStride ) ), 1 );
  551|       |  mm_grady = _mm256_inserti128_si256( _mm256_castsi128_si256( _vv_loadl_epi64( ( __m128i* )gY0 ) ), _vv_loadl_epi64( ( __m128i* )( gY0 + gradStride ) ), 1 );
  552|       |
  553|       |  mm_dI    = _mm256_madd_epi16( _mm256_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm256_unpacklo_epi16( mm_gradx, mm_grady ) );
  554|       |  mm_dI    = _mm256_min_epi32( mm_dimax, _mm256_max_epi32( mm_dimin, mm_dI ) );
  555|       |
  556|       |  // combine four rows
  557|       |  mm_dI = _mm256_packs_epi32( mm_dI0, mm_dI );
  558|       |  const Pel* src0 = srcPel + srcStride;
  559|       |  mm_src = _mm256_inserti128_si256(
  560|       |    _mm256_castsi128_si256(_mm_unpacklo_epi64(_vv_loadl_epi64((const __m128i *)srcPel), _vv_loadl_epi64((const __m128i *)(srcPel + (srcStride << 1))))),
  561|       |    _mm_unpacklo_epi64(_vv_loadl_epi64((const __m128i *)src0), _vv_loadl_epi64((const __m128i *)(src0 + (srcStride << 1)))),
  562|       |    1
  563|       |  );
  564|       |  mm_dI = _mm256_add_epi16(mm_dI, mm_src);
  565|      0|  if (!bi)
  566|      0|  {
  567|      0|    mm_dI = _mm256_srai_epi16(_mm256_adds_epi16(mm_dI, mm_offset), shiftNum);
  568|      0|    mm_dI = _mm256_min_epi16(vibdimax, _mm256_max_epi16(vibdimin, mm_dI));
  569|      0|  }
  570|       |
  571|       |  // store final results
  572|       |  __m128i dITmp = _mm256_extractf128_si256(mm_dI, 1);
  573|       |  Pel* dst0 = dstPel;
  574|      0|  _vv_storel_epi64((__m128i *)dst0, _mm256_castsi256_si128(mm_dI));
  575|      0|  dst0 += dstStride; _vv_storel_epi64((__m128i *)dst0, dITmp);
  576|      0|  dst0 += dstStride; _vv_storel_epi64((__m128i *)dst0, _mm_unpackhi_epi64(_mm256_castsi256_si128(mm_dI), _mm256_castsi256_si128(mm_dI)));
  577|      0|  dst0 += dstStride; _vv_storel_epi64((__m128i *)dst0, _mm_unpackhi_epi64(dITmp, dITmp));
  578|       |#else
  579|       |  __m128i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0;
  580|       |  __m128i mm_offset = _mm_set1_epi16( offset );
  581|       |  __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
  582|       |  __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
  583|       |  __m128i mm_dimin  = _mm_set1_epi32( -dILimit );
  584|       |  __m128i mm_dimax  = _mm_set1_epi32( dILimit - 1 );
  585|       |
  586|      0|  for( int h = 0; h < height; h += 2 )
  587|      0|  {
  588|      0|    const int* vX = dMvX;
  589|      0|    const int* vY = dMvY;
  590|      0|    const Pel* gX = gradX;
  591|      0|    const Pel* gY = gradY;
  592|      0|    const Pel* src = srcPel;
  593|      0|    Pel*       dst = dstPel;
  594|       |
  595|       |    // first row
  596|      0|    mm_dmvx  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) vX ), _mm_setzero_si128() );
  597|      0|    mm_dmvy  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) vY ), _mm_setzero_si128() );
  598|      0|    mm_gradx = _vv_loadl_epi64( ( __m128i* ) gX );
  599|      0|    mm_grady = _vv_loadl_epi64( ( __m128i* ) gY );
  600|      0|    mm_dI0   = _mm_madd_epi16 ( _mm_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm_unpacklo_epi16( mm_gradx, mm_grady ) );
  601|      0|    mm_dI0   = _mm_min_epi32  ( mm_dimax, _mm_max_epi32( mm_dimin, mm_dI0 ) );
  602|       |
  603|       |    // second row
  604|      0|    mm_dmvx  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) ( vX + dMvStride ) ), _mm_setzero_si128() );
  605|      0|    mm_dmvy  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) ( vY + dMvStride ) ), _mm_setzero_si128() );
  606|      0|    mm_gradx = _vv_loadl_epi64( ( __m128i* ) ( gX + gradStride ) );
  607|      0|    mm_grady = _vv_loadl_epi64( ( __m128i* ) ( gY + gradStride ) );
  608|      0|    mm_dI    = _mm_madd_epi16 ( _mm_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm_unpacklo_epi16( mm_gradx, mm_grady ) );
  609|      0|    mm_dI    = _mm_min_epi32  ( mm_dimax, _mm_max_epi32( mm_dimin, mm_dI ) );
  610|       |
  611|       |    // combine both rows
  612|      0|    mm_dI = _mm_packs_epi32( mm_dI0, mm_dI );
  613|      0|    mm_dI = _mm_add_epi16  ( _mm_unpacklo_epi64( _vv_loadl_epi64( ( const __m128i * )src ), _vv_loadl_epi64( ( const __m128i * )( src + srcStride ) ) ), mm_dI );
  614|      0|    if (!bi)
  615|      0|    {
  616|      0|      mm_dI = _mm_srai_epi16(_mm_adds_epi16(mm_dI, mm_offset), shiftNum);
  617|      0|      mm_dI = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, mm_dI));
  618|      0|    }
  619|       |
  620|      0|    _vv_storel_epi64( ( __m128i * )  dst,                                   mm_dI );
  621|      0|    _vv_storel_epi64( ( __m128i * )( dst + dstStride ), _mm_unpackhi_epi64( mm_dI, mm_dI ) );
  622|       |
  623|      0|    dMvX   += (dMvStride  << 1);
  624|      0|    dMvY   += (dMvStride  << 1);
  625|      0|    gradX  += (gradStride << 1);
  626|      0|    gradY  += (gradStride << 1);
  627|      0|    srcPel += (srcStride  << 1);
  628|      0|    dstPel += (dstStride  << 1);
  629|      0|  }
  630|       |#endif
  631|      0|}
Unexecuted instantiation: void vvenc::applyPROF_SSE<(vvenc::x86_simd::X86_VEXT)1>(short*, int, short const*, int, int, int, short const*, short const*, int, int const*, int const*, int, bool const&, int, short, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::applyPROF_SSE<(vvenc::x86_simd::X86_VEXT)4>(short*, int, short const*, int, int, int, short const*, short const*, int, int const*, int const*, int, bool const&, int, short, vvenc::ClpRng const&)
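
applyPROF_SSE refines a 4x4 block sample by sample: dI = dMvX*gradX + dMvY*gradY (the unpack+madd pairs above), clamped to [-dILimit, dILimit - 1], added to the source, and, for uni-prediction, rounded, shifted, and clipped to the output range. A scalar model (editor's sketch, ignoring the int32-to-int16 saturation that packs_epi32 applies to the dMv inputs):

#include <algorithm>
#include <cstdint>

static void applyPROF_ref(int16_t* dst, int dstStride,
                          const int16_t* src, int srcStride,
                          const int16_t* gX, const int16_t* gY, int gradStride,
                          const int* dMvX, const int* dMvY, int dMvStride,
                          bool bi, int shiftNum, int offset,
                          int clpMin, int clpMax, int bd)
{
  const int dILimit = 1 << std::max(bd + 1, 13);
  for (int y = 0; y < 4; y++)
  {
    for (int x = 0; x < 4; x++)
    {
      int dI = dMvX[x] * gX[x] + dMvY[x] * gY[x];
      dI = std::min(dILimit - 1, std::max(-dILimit, dI));
      int v = src[x] + dI;
      if (!bi)   // uni-prediction: round, shift, clip to the pel range
        v = std::min(clpMax, std::max(clpMin, (v + offset) >> shiftNum));
      dst[x] = (int16_t) v;
    }
    dst += dstStride; src += srcStride; gX += gradStride; gY += gradStride;
    dMvX += dMvStride; dMvY += dMvStride;
  }
}
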
  632|       |
  633|       |template<X86_VEXT vext>
  634|       |void padDmvr_SSE( const Pel* src, const int srcStride, Pel* dst, const int dstStride, int width, int height, int padSize )
  635|      0|{
  636|      0|  _mm_prefetch( ( const char* )  src,            _MM_HINT_T0 );
  637|      0|  _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
  638|       |
  639|      0|  if( width == 7 && padSize == 1 )
  640|      0|  {
  641|      0|    const __m128i sl = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 12, 13 );
  642|      0|    __m128i l = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) src ), sl );
  643|       |
  644|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 1 * dstStride - 1 ), l );
  645|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride     ), l );
  646|       |
  647|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 0 * dstStride - 1 ), l );
  648|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride     ), l );
  649|       |
  650|      0|    for( height--, dst += dstStride, src += srcStride; height > 0; height--, src += srcStride, dst += dstStride )
  651|      0|    {
  652|      0|      _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
  653|       |
  654|      0|      l = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) src ), sl );
  655|       |
  656|      0|      _mm_storeu_si16 ( ( __m128i* ) ( dst - 1 ), l );
  657|      0|      _mm_storeu_si128( ( __m128i* ) ( dst     ), l );
  658|      0|    }
  659|       |
  660|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 1 ), l );
  661|      0|    _mm_storeu_si128( ( __m128i* ) ( dst     ), l );
  662|      0|  }
  663|      0|  else if( width == 11 && padSize == 1 )
  664|      0|  {
  665|      0|    const __m128i sl = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15 );
  666|      0|    __m128i l0 =                   _mm_loadu_si128( ( const __m128i* ) &src[0] );
  667|      0|    __m128i l1 = _mm_shuffle_epi8( _vv_loadl_epi64( ( const __m128i* ) &src[8] ), sl );
  668|       |
  669|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 1 * dstStride - 1 ), l0 );
  670|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride     ), l0 );
  671|      0|    _vv_storel_epi64( ( __m128i* ) ( dst - 1 * dstStride + 8 ), l1 );
  672|       |
  673|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 0 * dstStride - 1 ), l0 );
  674|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride     ), l0 );
  675|      0|    _vv_storel_epi64( ( __m128i* ) ( dst - 0 * dstStride + 8 ), l1 );
  676|       |
  677|      0|    for( height--, dst += dstStride, src += srcStride; height > 0; height--, src += srcStride, dst += dstStride )
  678|      0|    {
  679|      0|      _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
  680|       |
  681|      0|      l0 =                   _mm_loadu_si128( ( const __m128i* ) &src[0] );
  682|      0|      l1 = _mm_shuffle_epi8( _vv_loadl_epi64( ( const __m128i* ) &src[8] ), sl );
  683|       |
  684|      0|      _mm_storeu_si16 ( ( __m128i* ) ( dst - 1 ), l0 );
  685|      0|      _mm_storeu_si128( ( __m128i* ) ( dst     ), l0 );
  686|      0|      _vv_storel_epi64( ( __m128i* ) ( dst + 8 ), l1 );
  687|      0|    }
  688|       |
  689|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 1 ), l0 );
  690|      0|    _mm_storeu_si128( ( __m128i* ) ( dst     ), l0 );
  691|      0|    _vv_storel_epi64( ( __m128i* ) ( dst + 8 ), l1 );
  692|      0|  }
  693|      0|  else if( width == 15 && padSize == 2 )
  694|      0|  {
  695|      0|    const __m128i sl = _mm_setr_epi8(  0,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 12, 13 );
  696|      0|    const __m128i sb = _mm_setr_epi8(  0,  1, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
  697|      0|    const __m128i se = _mm_setr_epi8( 12, 13, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
  698|      0|    __m128i l0 =                   _mm_loadu_si128( ( const __m128i* ) &src[0] );
  699|      0|    __m128i l1 = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) &src[8] ), sl );
  700|      0|    __m128i b  = _mm_shuffle_epi8( l0, sb );
  701|      0|    __m128i e  = _mm_shuffle_epi8( l1, se );
  702|       |
  703|      0|    _mm_storeu_si32 ( ( __m128i* ) ( dst - 2 * dstStride -  2 ), b  );
  704|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 2 * dstStride      ), l0 );
  705|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 2 * dstStride +  8 ), l1 );
  706|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 2 * dstStride + 16 ), e  );
  707|       |
  708|      0|    _mm_storeu_si32 ( ( __m128i* ) ( dst - 1 * dstStride -  2 ), b  );
  709|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride      ), l0 );
  710|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride +  8 ), l1 );
  711|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 1 * dstStride + 16 ), e  );
  712|       |
  713|      0|    _mm_storeu_si32 ( ( __m128i* ) ( dst - 0 * dstStride -  2 ), b  );
  714|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride      ), l0 );
  715|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride +  8 ), l1 );
  716|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 0 * dstStride + 16 ), e  );
  717|       |
  718|      0|    for( height--, dst += dstStride, src += srcStride; height > 0; height--, src += srcStride, dst += dstStride )
  719|      0|    {
  720|      0|      _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
  721|       |
  722|      0|      l0 =                   _mm_loadu_si128( ( const __m128i* ) &src[0] );
  723|      0|      l1 = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) &src[8] ), sl );
  724|      0|      b = _mm_shuffle_epi8( l0, sb );
  725|      0|      e = _mm_shuffle_epi8( l1, se );
  726|       |
  727|      0|      _mm_storeu_si32 ( ( __m128i* ) ( dst -  2 ), b  );
  728|      0|      _mm_storeu_si128( ( __m128i* ) ( dst      ), l0 );
  729|      0|      _mm_storeu_si128( ( __m128i* ) ( dst +  8 ), l1 );
  730|      0|      _mm_storeu_si16 ( ( __m128i* ) ( dst + 16 ), e  );
  731|      0|    }
  732|       |
  733|      0|    _mm_storeu_si32 ( ( __m128i* ) ( dst -  2 ), b  );
  734|      0|    _mm_storeu_si128( ( __m128i* ) ( dst      ), l0 );
  735|      0|    _mm_storeu_si128( ( __m128i* ) ( dst +  8 ), l1 );
  736|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst + 16 ), e  );
  737|       |
  738|      0|    dst += dstStride;
  739|       |
  740|      0|    _mm_storeu_si32 ( ( __m128i* ) ( dst -  2 ), b  );
  741|      0|    _mm_storeu_si128( ( __m128i* ) ( dst      ), l0 );
  742|      0|    _mm_storeu_si128( ( __m128i* ) ( dst +  8 ), l1 );
  743|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst + 16 ), e  );
  744|      0|  }
  745|      0|  else if( width == 23 && padSize == 2 )
  746|      0|  {
  747|      0|    const __m128i sl = _mm_setr_epi8(  0,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 12, 13 );
  748|      0|    const __m128i sb = _mm_setr_epi8(  0,  1, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
  749|      0|    const __m128i se = _mm_setr_epi8( 12, 13, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
  750|      0|    __m128i l0 =                   _mm_loadu_si128( ( const __m128i* ) &src[ 0] );
  751|      0|    __m128i l1 =                   _mm_loadu_si128( ( const __m128i* ) &src[ 8] );
  752|      0|    __m128i l2 = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) &src[16] ), sl );
  753|      0|    __m128i b  = _mm_shuffle_epi8( l0, sb );
  754|      0|    __m128i e  = _mm_shuffle_epi8( l2, se );
  755|       |
  756|      0|    _mm_storeu_si32 ( ( __m128i* ) ( dst - 2 * dstStride -  2 ), b );
  757|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 2 * dstStride      ), l0 );
  758|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 2 * dstStride +  8 ), l1 );
  759|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 2 * dstStride + 16 ), l2 );
  760|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 2 * dstStride + 24 ), e );
  761|       |
  762|      0|    _mm_storeu_si32 ( ( __m128i* ) ( dst - 1 * dstStride -  2 ), b );
  763|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride      ), l0 );
  764|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride +  8 ), l1 );
  765|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 1 * dstStride + 16 ), l2 );
  766|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 1 * dstStride + 24 ), e );
  767|       |
  768|      0|    _mm_storeu_si32 ( ( __m128i* ) ( dst - 0 * dstStride -  2 ), b );
  769|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride      ), l0 );
  770|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride +  8 ), l1 );
  771|      0|    _mm_storeu_si128( ( __m128i* ) ( dst - 0 * dstStride + 16 ), l2 );
  772|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst - 0 * dstStride + 24 ), e );
  773|       |
  774|      0|    for( height--, dst += dstStride, src += srcStride; height > 0; height--, src += srcStride, dst += dstStride )
  775|      0|    {
  776|      0|      _mm_prefetch( ( const char* ) &src[srcStride], _MM_HINT_T0 );
  777|       |
  778|      0|      l0 =                   _mm_loadu_si128( ( const __m128i* ) &src[ 0] );
  779|      0|      l1 =                   _mm_loadu_si128( ( const __m128i* ) &src[ 8] );
  780|      0|      l2 = _mm_shuffle_epi8( _mm_loadu_si128( ( const __m128i* ) &src[16] ), sl );
  781|      0|      b  = _mm_shuffle_epi8( l0, sb );
  782|      0|      e  = _mm_shuffle_epi8( l2, se );
  783|       |
  784|      0|      _mm_storeu_si32 ( ( __m128i* ) ( dst -  2 ), b );
  785|      0|      _mm_storeu_si128( ( __m128i* ) ( dst      ), l0 );
  786|      0|      _mm_storeu_si128( ( __m128i* ) ( dst +  8 ), l1 );
  787|      0|      _mm_storeu_si128( ( __m128i* ) ( dst + 16 ), l2 );
  788|      0|      _mm_storeu_si16 ( ( __m128i* ) ( dst + 24 ), e );
  789|      0|    }
  790|       |
  791|      0|    _mm_storeu_si32 ( ( __m128i* ) ( dst -  2 ), b );
  792|      0|    _mm_storeu_si128( ( __m128i* ) ( dst      ), l0 );
  793|      0|    _mm_storeu_si128( ( __m128i* ) ( dst +  8 ), l1 );
  794|      0|    _mm_storeu_si128( ( __m128i* ) ( dst + 16 ), l2 );
  795|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst + 24 ), e );
  796|       |
  797|      0|    dst += dstStride;
  798|       |
  799|      0|    _mm_storeu_si32 ( ( __m128i* ) ( dst -  2 ), b );
  800|      0|    _mm_storeu_si128( ( __m128i* ) ( dst      ), l0 );
  801|      0|    _mm_storeu_si128( ( __m128i* ) ( dst +  8 ), l1 );
  802|      0|    _mm_storeu_si128( ( __m128i* ) ( dst + 16 ), l2 );
  803|      0|    _mm_storeu_si16 ( ( __m128i* ) ( dst + 24 ), e );
  804|      0|  }
  805|      0|  else
  806|      0|  {
  807|       |    // TODO: fix for 444!
  808|       |
  809|      0|    g_pelBufOP.copyBuffer( ( const char* ) src, srcStride * sizeof( Pel ), ( char* ) dst, dstStride * sizeof( Pel ), width * sizeof( Pel ), height );
  810|       |
  811|       |    /*left and right padding*/
  812|      0|    Pel* ptrTemp1 = dst;
  813|      0|    Pel* ptrTemp2 = dst + (width - 1);
  814|      0|    ptrdiff_t offset = 0;
  815|      0|    for( int i = 0; i < height; i++ )
  816|      0|    {
  817|      0|      offset = dstStride * i;
  818|      0|      for( int j = 1; j <= padSize; j++ )
  819|      0|      {
  820|      0|        *(ptrTemp1 - j + offset) = *(ptrTemp1 + offset);
  821|      0|        *(ptrTemp2 + j + offset) = *(ptrTemp2 + offset);
  822|      0|      }
  823|      0|    }
  824|       |    /*Top and Bottom padding*/
  825|      0|    int numBytes = (width + padSize + padSize) * sizeof( Pel );
  826|      0|    ptrTemp1 = (dst - padSize);
  827|      0|    ptrTemp2 = (dst + (dstStride * (height - 1)) - padSize);
  828|      0|    for( int i = 1; i <= padSize; i++ )
  829|      0|    {
  830|      0|      memcpy( ptrTemp1 - (i * dstStride), (ptrTemp1), numBytes );
  831|      0|      memcpy( ptrTemp2 + (i * dstStride), (ptrTemp2), numBytes );
  832|      0|    }
  833|      0|  }
  834|      0|}
Unexecuted instantiation: void vvenc::padDmvr_SSE<(vvenc::x86_simd::X86_VEXT)1>(short const*, int, short*, int, int, int, int)
Unexecuted instantiation: void vvenc::padDmvr_SSE<(vvenc::x86_simd::X86_VEXT)4>(short const*, int, short*, int, int, int, int)
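
The specialised branches all implement the pattern the generic else-branch spells out in scalar form: copy the block, replicate padSize columns on each side, then replicate padSize full rows at top and bottom. In the width == 7, padSize == 1 fast path, the shuffle mask sl duplicates 16-bit sample 6 into lane 7, so one 16-byte store writes the row together with its right padding while a 2-byte store covers the left edge. The equivalent scalar row operation (editor's model):

#include <cstdint>

static void padRow7_ref(const int16_t* src, int16_t* dst)
{
  dst[-1] = src[0];                        // left padding (the 16-bit store at dst - 1)
  for (int x = 0; x < 7; x++)
    dst[x] = src[x];                       // row payload
  dst[7] = src[6];                         // right padding (the duplicated lane 7)
}
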
  835|       |
  836|       |#if ENABLE_SIMD_OPT_BDOF
  837|       |template<X86_VEXT vext>
  838|       |void InterPredInterpolation::_initInterPredictionX86()
  839|      0|{
  840|      0|  xFpBiDirOptFlow     = BiOptFlowCoreSIMD<vext>;
  841|      0|  xFpBDOFGradFilter   = gradFilter_SSE<vext>;
  842|      0|  xFpProfGradFilter   = gradFilter_SSE<vext, false>;
  843|      0|  xFpApplyPROF        = applyPROF_SSE<vext>;
  844|      0|  xFpPadDmvr          = padDmvr_SSE<vext>;
  845|      0|}
Unexecuted instantiation: void vvenc::InterPredInterpolation::_initInterPredictionX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::InterPredInterpolation::_initInterPredictionX86<(vvenc::x86_simd::X86_VEXT)4>()
  846|       |template void InterPredInterpolation::_initInterPredictionX86<SIMDX86>();
  847|       |
  848|       |#endif
  849|       |} // namespace vvenc
  850|       |
  851|       |//! \}
  852|       |
  853|       |#endif // TARGET_SIMD_X86
  854|       |//! \}