Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvenc/source/Lib/CommonLib/x86/AffineGradientSearchX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
/**
43
 * \file
44
 * \brief Implementation of AffineGradientSearch class
45
 */
46
//#define USE_AVX2
47
// ====================================================================================================================
48
// Includes
49
// ====================================================================================================================
50
51
#include "CommonDefX86.h"
52
#include "../AffineGradientSearch.h"
53
54
//! \ingroup CommonLib
55
//! \{
56
57
#if defined(TARGET_SIMD_X86)  && ENABLE_SIMD_OPT_AFFINE_ME
58
59
namespace vvenc {
60
61
  template<X86_VEXT vext>
62
  static void simdHorizontalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height)
63
0
  {
64
0
    CHECK( width % 8, "Invalid size!" );
65
66
    // pPred is 10-bit
67
68
    // -1 0 1
69
    // -2 0 2
70
    // -1 0 1
71
    // 
72
    // sum( sobel ) = 8, i.e. 4-bit extension
73
74
0
    for( int y = 1; y < ( height - 1 ); y++ )
75
0
    {
76
0
      int x = 1;
77
0
      for( ; x < ( width - 8 ); x += 8 )
78
0
      {
79
0
        __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x - 1] );
80
0
        acc         = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x + 1] ), acc );
81
0
        acc         = _mm_slli_epi16( acc, 1 );
82
0
        acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
83
0
        acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
84
0
        acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
85
0
        acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );
86
87
0
        _mm_storeu_si128( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc );
88
0
      }
89
90
0
      __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x - 1] );
91
0
      acc         = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x + 1] ), acc );
92
0
      acc         = _mm_slli_epi16( acc, 1 );
93
0
      acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
94
0
      acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
95
0
      acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
96
0
      acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );
97
98
0
      _vv_storel_epi64( ( __m128i* ) &pDerivate[y * derivateBufStride + x],                         acc );
99
0
      _mm_storeu_si32 (              &pDerivate[y * derivateBufStride + x + 4], _mm_unpackhi_epi64( acc, acc ) );
100
101
0
      pDerivate[y * derivateBufStride]               = pDerivate[y * derivateBufStride + 1];
102
0
      pDerivate[y * derivateBufStride + (width - 1)] = pDerivate[y * derivateBufStride + (width - 2)];
103
0
    }
104
105
0
    memcpy( pDerivate,                                      pDerivate + derivateBufStride,                  width * sizeof( pDerivate[ 0 ] ) );
106
0
    memcpy( pDerivate + ( height - 1 ) * derivateBufStride, pDerivate + ( height - 2 ) * derivateBufStride, width * sizeof( pDerivate[ 0 ] ) );
107
0
  }
Unexecuted instantiation: AffineGradientSearch_sse41.cpp:void vvenc::simdHorizontalSobelFilter<(vvenc::x86_simd::X86_VEXT)1>(short*, int, short*, int, int, int)
Unexecuted instantiation: AffineGradientSearch_avx2.cpp:void vvenc::simdHorizontalSobelFilter<(vvenc::x86_simd::X86_VEXT)4>(short*, int, short*, int, int, int)
108
109
  template<X86_VEXT vext>
110
  static void simdVerticalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height)
111
0
  {
112
0
    CHECK( width % 8, "Invalid size!" );
113
114
    // pPred is 10-bit
115
116
    // -1 -2 -1
117
    //  0  0  0
118
    //  1  2  1
119
    // 
120
    // sum( sobel ) = 8, i.e. 4-bit extension
121
122
0
    for( int y = 1; y < ( height - 1 ); y++ )
123
0
    {
124
0
      int x = 1;
125
0
      for( ; x < ( width - 8 ); x += 8 )
126
0
      {
127
0
        __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x] );
128
0
        acc         = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x] ), acc );
129
0
        acc         = _mm_slli_epi16( acc, 1 );
130
0
        acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
131
0
        acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
132
0
        acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
133
0
        acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );
134
135
0
        _mm_storeu_si128( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc );
136
0
      }
137
      
138
0
      __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x] );
139
0
      acc         = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x] ), acc );
140
0
      acc         = _mm_slli_epi16( acc, 1 );
141
0
      acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
142
0
      acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
143
0
      acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
144
0
      acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );
145
146
0
      _vv_storel_epi64( ( __m128i* ) &pDerivate[y * derivateBufStride + x],                         acc );
147
0
      _mm_storeu_si32 (              &pDerivate[y * derivateBufStride + x + 4], _mm_unpackhi_epi64( acc, acc ) );
148
149
0
      pDerivate[y * derivateBufStride]               = pDerivate[y * derivateBufStride + 1];
150
0
      pDerivate[y * derivateBufStride + (width - 1)] = pDerivate[y * derivateBufStride + (width - 2)];
151
0
    }
152
153
0
    memcpy( pDerivate,                                    pDerivate + derivateBufStride,                width * sizeof( pDerivate[ 0 ] ) );
154
0
    memcpy( pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof( pDerivate[ 0 ] ) );
155
0
  }
Unexecuted instantiation: AffineGradientSearch_sse41.cpp:void vvenc::simdVerticalSobelFilter<(vvenc::x86_simd::X86_VEXT)1>(short*, int, short*, int, int, int)
Unexecuted instantiation: AffineGradientSearch_avx2.cpp:void vvenc::simdVerticalSobelFilter<(vvenc::x86_simd::X86_VEXT)4>(short*, int, short*, int, int, int)
156
157
158
159
0
#define CALC_EQUAL_COEFF_8PXLS(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,loadLocation)       \
160
0
{                                                                                                              \
161
0
inter0 = _mm_mul_epi32(x1, y1);                                                                                \
162
0
inter1 = _mm_mul_epi32(tmp0, tmp2);                                                                            \
163
0
inter2 = _mm_mul_epi32(x2, y2);                                                                                \
164
0
inter3 = _mm_mul_epi32(tmp1, tmp3);                                                                            \
165
0
inter2 = _mm_add_epi64(inter0, inter2);                                                                        \
166
0
inter3 = _mm_add_epi64(inter1, inter3);                                                                        \
167
0
inter0 = _vv_loadl_epi64(loadLocation);                                                                        \
168
0
inter3 = _mm_add_epi64(inter2, inter3);                                                                        \
169
0
inter1 = _mm_srli_si128(inter3, 8);                                                                            \
170
0
inter3 = _mm_add_epi64(inter1, inter3);                                                                        \
171
0
inter3 = _mm_add_epi64(inter0, inter3);                                                                        \
172
0
}
173
174
  template<X86_VEXT vext, bool b6Param>
175
  static void simdEqualCoeffComputer(Pel* const pResidue, const int residueStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7])
176
0
  {
177
0
    __m128i mmFour;
178
0
    __m128i mmTmp[4];
179
0
    __m128i mmIntermediate[4];
180
0
    __m128i mmIndxK, mmIndxJ;
181
0
    __m128i mmResidue[2];
182
0
    __m128i mmC[12];
183
184
    // Add directly to indexes to get new index
185
0
    mmFour  = _mm_set1_epi32(4);
186
0
    mmIndxJ = _mm_set1_epi32(-2);
187
188
189
0
    static constexpr int n = b6Param ? 6 : 4;
190
0
    int idx1 = -2 * derivateBufStride - 4;
191
0
    int idx2 = -    derivateBufStride - 4;
192
0
    int resIdx1 = -2 * residueStride - 4;
193
0
    int resIdx2 = -    residueStride - 4;
194
195
0
    for (int j = 0; j < height; j += 2)
196
0
    {
197
0
      if (!(j & 3))
198
0
        mmIndxJ = _mm_add_epi32(mmIndxJ, mmFour);
199
0
      mmIndxK = _mm_set1_epi32(-2);
200
0
      idx1 += (derivateBufStride << 1);
201
0
      idx2 += (derivateBufStride << 1);
202
0
      resIdx1 += (residueStride << 1);
203
0
      resIdx2 += (residueStride << 1);
204
205
0
      for (int k = 0; k < width; k += 4)
206
0
      {
207
0
        idx1 += 4;
208
0
        idx2 += 4;
209
0
        resIdx1 += 4;
210
0
        resIdx2 += 4;
211
0
        mmIndxK = _mm_add_epi32(mmIndxK, mmFour);
212
213
0
        if (b6Param)
214
0
        {
215
          // mmC[0-5] for iC[0-5] of 1st row of pixels
216
0
          mmC[0] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[0][idx1]));
217
0
          mmC[2] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[1][idx1]));
218
0
          mmC[1] = _mm_mullo_epi32(mmIndxK, mmC[0]);
219
0
          mmC[3] = _mm_mullo_epi32(mmIndxK, mmC[2]);
220
0
          mmC[4] = _mm_mullo_epi32(mmIndxJ, mmC[0]);
221
0
          mmC[5] = _mm_mullo_epi32(mmIndxJ, mmC[2]);
222
223
          // mmC[6-11] for iC[0-5] of 2nd row of pixels
224
0
          mmC[6] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[0][idx2]));
225
0
          mmC[8] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[1][idx2]));
226
0
          mmC[7] = _mm_mullo_epi32(mmIndxK, mmC[6]);
227
0
          mmC[9] = _mm_mullo_epi32(mmIndxK, mmC[8]);
228
0
          mmC[10] = _mm_mullo_epi32(mmIndxJ, mmC[6]);
229
0
          mmC[11] = _mm_mullo_epi32(mmIndxJ, mmC[8]);
230
0
        }
231
0
        else
232
0
        {
233
          // mmC[0-3] for iC[0-3] of 1st row of pixels
234
0
          mmC[0] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[0][idx1]));
235
0
          mmC[2] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[1][idx1]));
236
0
          mmC[1] = _mm_mullo_epi32(mmIndxK, mmC[0]);
237
0
          mmC[3] = _mm_mullo_epi32(mmIndxJ, mmC[0]);
238
0
          mmTmp[0] = _mm_mullo_epi32(mmIndxJ, mmC[2]);
239
0
          mmTmp[1] = _mm_mullo_epi32(mmIndxK, mmC[2]);
240
0
          mmC[1] = _mm_add_epi32(mmC[1], mmTmp[0]);
241
0
          mmC[3] = _mm_sub_epi32(mmC[3], mmTmp[1]);
242
243
          // mmC[4-7] for iC[0-3] of 2nd row of pixels
244
0
          mmC[4] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[0][idx2]));
245
0
          mmC[6] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[1][idx2]));
246
0
          mmC[5] = _mm_mullo_epi32(mmIndxK, mmC[4]);
247
0
          mmC[7] = _mm_mullo_epi32(mmIndxJ, mmC[4]);
248
0
          mmTmp[2] = _mm_mullo_epi32(mmIndxJ, mmC[6]);
249
0
          mmTmp[3] = _mm_mullo_epi32(mmIndxK, mmC[6]);
250
0
          mmC[5] = _mm_add_epi32(mmC[5], mmTmp[2]);
251
0
          mmC[7] = _mm_sub_epi32(mmC[7], mmTmp[3]);
252
0
        }
253
254
        // Residue
255
0
        mmResidue[0] = _vv_loadl_epi64((const __m128i*)&pResidue[resIdx1]);
256
0
        mmResidue[1] = _vv_loadl_epi64((const __m128i*)&pResidue[resIdx2]);
257
0
        mmResidue[0] = _mm_cvtepi16_epi32(mmResidue[0]);
258
0
        mmResidue[1] = _mm_cvtepi16_epi32(mmResidue[1]);
259
0
        mmResidue[0] = _mm_slli_epi32(mmResidue[0], 3);
260
0
        mmResidue[1] = _mm_slli_epi32(mmResidue[1], 3);
261
262
        // Calculation of coefficient matrix
263
0
        for (int col = 0; col < n; col++)
264
0
        {
265
0
          mmTmp[0] = _mm_srli_si128(mmC[0 + col], 4);
266
0
          mmTmp[1] = _mm_srli_si128(mmC[n + col], 4);
267
0
          CALC_EQUAL_COEFF_8PXLS(mmC[0 + col], mmC[n + col], mmC[0 + col], mmC[n + col], mmTmp[0], mmTmp[1], mmTmp[0], mmTmp[1], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], (const __m128i*)&pEqualCoeff[col + 1][col]);
268
0
          _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][col], mmIntermediate[3]);
269
270
0
          for (int row = col + 1; row < n; row++)
271
0
          {
272
0
            mmTmp[2] = _mm_srli_si128(mmC[0 + row], 4);
273
0
            mmTmp[3] = _mm_srli_si128(mmC[n + row], 4);
274
0
            CALC_EQUAL_COEFF_8PXLS(mmC[0 + col], mmC[n + col], mmC[0 + row], mmC[n + row], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], (const __m128i*)&pEqualCoeff[col + 1][row]);
275
0
            _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][row], mmIntermediate[3]);
276
0
            _vv_storel_epi64((__m128i*)&pEqualCoeff[row + 1][col], mmIntermediate[3]);
277
0
          }
278
279
0
          mmTmp[2] = _mm_srli_si128(mmResidue[0], 4);
280
0
          mmTmp[3] = _mm_srli_si128(mmResidue[1], 4);
281
0
          CALC_EQUAL_COEFF_8PXLS(mmC[0 + col], mmC[n + col], mmResidue[0], mmResidue[1], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], (const __m128i*)&pEqualCoeff[col + 1][n]);
282
0
          _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][n], mmIntermediate[3]);
283
0
        }
284
0
      }
285
286
0
      idx1 -= (width);
287
0
      idx2 -= (width);
288
0
      resIdx1 -= (width);
289
0
      resIdx2 -= (width);
290
0
    }
291
0
  }
Unexecuted instantiation: AffineGradientSearch_sse41.cpp:void vvenc::simdEqualCoeffComputer<(vvenc::x86_simd::X86_VEXT)1, false>(short*, int, short**, int, int, int, long (*) [7])
Unexecuted instantiation: AffineGradientSearch_sse41.cpp:void vvenc::simdEqualCoeffComputer<(vvenc::x86_simd::X86_VEXT)1, true>(short*, int, short**, int, int, int, long (*) [7])
292
293
#if USE_AVX2
294
295
0
#define CALC_EQUAL_COEFF_8PXLS_AVX2(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,res,loadLocation)  \
296
0
{                                                                                                                  \
297
0
inter0 = _mm256_mul_epi32(x1, y1);                                                                                 \
298
0
inter1 = _mm256_mul_epi32(tmp0, tmp2);                                                                             \
299
0
inter2 = _mm256_mul_epi32(x2, y2);                                                                                 \
300
0
inter3 = _mm256_mul_epi32(tmp1, tmp3);                                                                             \
301
0
inter2 = _mm256_add_epi64(inter0, inter2);                                                                         \
302
0
inter3 = _mm256_add_epi64(inter1, inter3);                                                                         \
303
0
res    = _vv_loadl_epi64(loadLocation);                                                                            \
304
0
inter3 = _mm256_add_epi64(inter2, inter3);                                                                         \
305
0
inter1 = _mm256_srli_si256(inter3, 8);                                                                             \
306
0
inter3 = _mm256_add_epi64(inter1, inter3);                                                                         \
307
0
res    = _mm_add_epi64(res, _mm256_castsi256_si128(inter3));                                                       \
308
0
res    = _mm_add_epi64(res, _mm256_extracti128_si256(inter3, 1));                                                  \
309
0
}
310
311
  template<bool b6Param>
312
  static void simdEqualCoeffComputer_avx2(Pel* const pResidue, const int residueStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7])
313
0
  {
314
0
    __m256i mmFour;
315
0
    __m256i mmTmp[4];
316
0
    __m256i mmIntermediate[4];
317
0
    __m256i mmIndxK, mmIndxJ;
318
0
    __m256i mmResidue[2];
319
0
    __m256i mmC[12];
320
0
    __m128i mmRes;
321
322
    // Add directly to indexes to get new index
323
0
    mmFour  = _mm256_set1_epi32(4);
324
0
    mmIndxJ = _mm256_set1_epi32(-2);
325
326
0
    static constexpr int n = b6Param ? 6 : 4;
327
0
    int idx1 = -2 * derivateBufStride - 8;
328
0
    int idx2 = -    derivateBufStride - 8;
329
0
    int resIdx1 = -2 * residueStride - 8;
330
0
    int resIdx2 = -    residueStride - 8;
331
332
0
    for (int j = 0; j < height; j += 2)
333
0
    {
334
0
      if (!(j & 3))
335
0
        mmIndxJ = _mm256_add_epi32(mmIndxJ, mmFour);
336
0
      mmIndxK = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_set1_epi32( -6 ) ), _mm_set1_epi32( -2 ), 1 );
337
0
      idx1 += (derivateBufStride << 1);
338
0
      idx2 += (derivateBufStride << 1);
339
0
      resIdx1 += (residueStride << 1);
340
0
      resIdx2 += (residueStride << 1);
341
342
0
      for (int k = 0; k < width; k += 8)
343
0
      {
344
0
        idx1 += 8;
345
0
        idx2 += 8;
346
0
        resIdx1 += 8;
347
0
        resIdx2 += 8;
348
0
        mmIndxK = _mm256_add_epi32(mmIndxK, mmFour);
349
0
        mmIndxK = _mm256_add_epi32(mmIndxK, mmFour);
350
351
0
        if (b6Param)
352
0
        {
353
          // mmC[0-5] for iC[0-5] of 1st row of pixels
354
0
          mmC[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1]));
355
0
          mmC[2] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1]));
356
0
          mmC[1] = _mm256_mullo_epi32(mmIndxK, mmC[0]);
357
0
          mmC[3] = _mm256_mullo_epi32(mmIndxK, mmC[2]);
358
0
          mmC[4] = _mm256_mullo_epi32(mmIndxJ, mmC[0]);
359
0
          mmC[5] = _mm256_mullo_epi32(mmIndxJ, mmC[2]);
360
361
          // mmC[6-11] for iC[0-5] of 2nd row of pixels
362
0
          mmC[6] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2]));
363
0
          mmC[8] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2]));
364
0
          mmC[7] = _mm256_mullo_epi32(mmIndxK, mmC[6]);
365
0
          mmC[9] = _mm256_mullo_epi32(mmIndxK, mmC[8]);
366
0
          mmC[10] = _mm256_mullo_epi32(mmIndxJ, mmC[6]);
367
0
          mmC[11] = _mm256_mullo_epi32(mmIndxJ, mmC[8]);
368
0
        }
369
0
        else
370
0
        {
371
          // mmC[0-3] for iC[0-3] of 1st row of pixels
372
0
          mmC[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1]));
373
0
          mmC[2] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1]));
374
0
          mmC[1] = _mm256_mullo_epi32(mmIndxK, mmC[0]);
375
0
          mmC[3] = _mm256_mullo_epi32(mmIndxJ, mmC[0]);
376
0
          mmTmp[0] = _mm256_mullo_epi32(mmIndxJ, mmC[2]);
377
0
          mmTmp[1] = _mm256_mullo_epi32(mmIndxK, mmC[2]);
378
0
          mmC[1] = _mm256_add_epi32(mmC[1], mmTmp[0]);
379
0
          mmC[3] = _mm256_sub_epi32(mmC[3], mmTmp[1]);
380
381
          // mmC[4-7] for iC[0-3] of 2nd row of pixels
382
0
          mmC[4] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2]));
383
0
          mmC[6] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2]));
384
0
          mmC[5] = _mm256_mullo_epi32(mmIndxK, mmC[4]);
385
0
          mmC[7] = _mm256_mullo_epi32(mmIndxJ, mmC[4]);
386
0
          mmTmp[2] = _mm256_mullo_epi32(mmIndxJ, mmC[6]);
387
0
          mmTmp[3] = _mm256_mullo_epi32(mmIndxK, mmC[6]);
388
0
          mmC[5] = _mm256_add_epi32(mmC[5], mmTmp[2]);
389
0
          mmC[7] = _mm256_sub_epi32(mmC[7], mmTmp[3]);
390
0
        }
391
392
        // Residue
393
0
        mmResidue[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&pResidue[resIdx1]));
394
0
        mmResidue[1] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&pResidue[resIdx2]));
395
0
        mmResidue[0] = _mm256_slli_epi32(mmResidue[0], 3);
396
0
        mmResidue[1] = _mm256_slli_epi32(mmResidue[1], 3);
397
398
        // Calculation of coefficient matrix
399
0
        for (int col = 0; col < n; col++)
400
0
        {
401
0
          mmTmp[0] = _mm256_srli_si256(mmC[0 + col], 4);
402
0
          mmTmp[1] = _mm256_srli_si256(mmC[n + col], 4);
403
0
          CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmC[0 + col], mmC[n + col], mmTmp[0], mmTmp[1], mmTmp[0], mmTmp[1], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][col]);
404
0
          _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][col], mmRes);
405
406
0
          for (int row = col + 1; row < n; row++)
407
0
          {
408
0
            mmTmp[2] = _mm256_srli_si256(mmC[0 + row], 4);
409
0
            mmTmp[3] = _mm256_srli_si256(mmC[n + row], 4);
410
0
            CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmC[0 + row], mmC[n + row], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][row]);
411
0
            _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][row], mmRes);
412
0
            _vv_storel_epi64((__m128i*)&pEqualCoeff[row + 1][col], mmRes);
413
0
          }
414
415
0
          mmTmp[2] = _mm256_srli_si256(mmResidue[0], 4);
416
0
          mmTmp[3] = _mm256_srli_si256(mmResidue[1], 4);
417
0
          CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmResidue[0], mmResidue[1], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][n]);
418
0
          _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][n], mmRes);
419
0
        }
420
0
      }
421
422
0
      idx1 -= (width);
423
0
      idx2 -= (width);
424
0
      resIdx1 -= (width);
425
0
      resIdx2 -= (width);
426
0
    }
427
0
  }
Unexecuted instantiation: AffineGradientSearch_avx2.cpp:void vvenc::simdEqualCoeffComputer_avx2<false>(short*, int, short**, int, int, int, long (*) [7])
Unexecuted instantiation: AffineGradientSearch_avx2.cpp:void vvenc::simdEqualCoeffComputer_avx2<true>(short*, int, short**, int, int, int, long (*) [7])
428
#endif
429
430
  template <X86_VEXT vext>
431
  void AffineGradientSearch::_initAffineGradientSearchX86()
432
0
  {
433
0
    m_HorizontalSobelFilter = simdHorizontalSobelFilter<vext>;
434
0
    m_VerticalSobelFilter   = simdVerticalSobelFilter<vext>;
435
#if USE_AVX2
436
    m_EqualCoeffComputer[0] = simdEqualCoeffComputer_avx2<false>;
437
    m_EqualCoeffComputer[1] = simdEqualCoeffComputer_avx2<true>;
438
#else
439
    m_EqualCoeffComputer[0] = simdEqualCoeffComputer<vext, false>;
440
    m_EqualCoeffComputer[1] = simdEqualCoeffComputer<vext, true>;
441
#endif
442
0
  }
Unexecuted instantiation: void vvenc::AffineGradientSearch::_initAffineGradientSearchX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::AffineGradientSearch::_initAffineGradientSearchX86<(vvenc::x86_simd::X86_VEXT)4>()
443
444
  template void AffineGradientSearch::_initAffineGradientSearchX86<SIMDX86>();
445
446
}
447
448
#endif //#ifdef TARGET_SIMD_X86
449
//! \}