Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvenc/source/Lib/CommonLib/x86/MCTFX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
/**
43
 * \file
44
 * \brief Implementation of AffineGradientSearch class
45
 */
46
//#define USE_AVX2
47
// ====================================================================================================================
48
// Includes
49
// ====================================================================================================================
50
51
#include "CommonDefX86.h"
52
53
#include "MCTF.h"
54
55
//! \ingroup CommonLib
56
//! \{
57
58
0
#define cond_mm_prefetch(a,b) _mm_prefetch(a,b)
59
//#define cond_mm_prefetch(a,b)
60
61
#if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_MCTF
62
63
namespace vvenc {
64
65
template<X86_VEXT vext>
66
int motionErrorLumaInt_SIMD( const Pel* org, const ptrdiff_t origStride, const Pel* buf, const ptrdiff_t buffStride, const int w, const int h, const int besterror )
67
0
{
68
0
  int error = 0;
69
0
  __m128i xerror = _mm_setzero_si128();
70
71
0
  cond_mm_prefetch( ( const char* ) ( org ),              _MM_HINT_T0 );
72
0
  cond_mm_prefetch( ( const char* ) ( org + origStride ), _MM_HINT_T0 );
73
0
  cond_mm_prefetch( ( const char* ) ( buf ),              _MM_HINT_T0 );
74
0
  cond_mm_prefetch( ( const char* ) ( buf + buffStride ), _MM_HINT_T0 );
75
76
0
  CHECK( w & 7, "SIMD blockSize needs to be a multiple of 8" );
77
78
#if USE_AVX2
79
0
  if( ( w & 15 ) == 0 && vext >= AVX2 )
80
0
  {
81
0
    for( int y1 = 0; y1 < h; y1 += 2 )
82
0
    {
83
0
      const Pel* origRowStart   = org + y1 * origStride;
84
0
      const Pel* bufferRowStart = buf + y1 * buffStride;
85
86
0
      cond_mm_prefetch( ( const char* ) ( origRowStart   + 2 * origStride ), _MM_HINT_T0 );
87
0
      cond_mm_prefetch( ( const char* ) ( origRowStart   + 3 * origStride ), _MM_HINT_T0 );
88
0
      cond_mm_prefetch( ( const char* ) ( bufferRowStart + 2 * buffStride ), _MM_HINT_T0 );
89
0
      cond_mm_prefetch( ( const char* ) ( bufferRowStart + 3 * buffStride ), _MM_HINT_T0 );
90
91
      __m256i vsum = _mm256_setzero_si256();
92
93
0
      for( int x1 = 0; x1 < w; x1 += 16 )
94
0
      {
95
0
        __m256i vorg1 = _mm256_loadu_si256( ( const __m256i* ) &origRowStart[x1] );
96
0
        __m256i vorg2 = _mm256_loadu_si256( ( const __m256i* ) &origRowStart[x1+origStride] );
97
0
        __m256i vbuf1 = _mm256_loadu_si256( ( const __m256i* ) &bufferRowStart[x1] );
98
0
        __m256i vbuf2 = _mm256_loadu_si256( ( const __m256i* ) &bufferRowStart[x1+buffStride] );
99
100
0
        __m256i vsum1 = _mm256_sub_epi16( vorg1, vbuf1 );
101
0
        __m256i vsum2 = _mm256_sub_epi16( vorg2, vbuf2 );
102
103
0
        __m256i vtmp1 = _mm256_madd_epi16( vsum1, vsum1 );
104
0
        __m256i vtmp2 = _mm256_madd_epi16( vsum2, vsum2 );
105
106
0
        vsum          = _mm256_add_epi32 ( vsum,  vtmp1 );
107
0
        vsum          = _mm256_add_epi32 ( vsum,  vtmp2 );
108
109
        //int diff = origRowStart[x1] - bufferRowStart[x1];
110
        //error += diff * diff;
111
        //diff = origRowStart[x1 + 1] - bufferRowStart[x1 + 1];
112
        //error += diff * diff;
113
0
      }
114
      
115
      __m128i
116
      xtmp = _mm256_extractf128_si256( vsum, 1 );
117
      xtmp = _mm_add_epi32( xtmp, _mm256_castsi256_si128( vsum ) );
118
      xerror = _mm_hadd_epi32( xerror, xtmp );
119
      error = _mm_cvtsi128_si32( xerror );
120
121
0
      if( error > besterror )
122
0
      {
123
0
        return error;
124
0
      }
125
0
    }
126
127
0
    xerror = _mm_hadd_epi32( xerror, xerror );
128
0
    xerror = _mm_hadd_epi32( xerror, xerror );
129
0
    error = _mm_cvtsi128_si32( xerror );
130
0
    return error;
131
0
  }
132
0
#endif
133
0
  for( int y1 = 0; y1 < h; y1 += 2 )
134
0
  {
135
0
    const Pel* origRowStart   = org + y1 * origStride;
136
0
    const Pel* bufferRowStart = buf + y1 * buffStride;
137
138
0
    __m128i xsum   = _mm_setzero_si128();
139
140
0
    for( int x1 = 0; x1 < w; x1 += 8 )
141
0
    {
142
0
      __m128i xorg1 = _mm_loadu_si128( ( const __m128i* ) &origRowStart[x1] );
143
0
      __m128i xorg2 = _mm_loadu_si128( ( const __m128i* ) &origRowStart[x1+origStride] );
144
0
      __m128i xbuf1 = _mm_loadu_si128( ( const __m128i* ) &bufferRowStart[x1] );
145
0
      __m128i xbuf2 = _mm_loadu_si128( ( const __m128i* ) &bufferRowStart[x1+buffStride] );
146
147
0
      __m128i xsum1 = _mm_sub_epi16( xorg1, xbuf1 );
148
0
      __m128i xsum2 = _mm_sub_epi16( xorg2, xbuf2 );
149
150
0
      __m128i xtmp1 = _mm_madd_epi16( xsum1, xsum1 );
151
0
      __m128i xtmp2 = _mm_madd_epi16( xsum2, xsum2 );
152
153
0
      xsum          = _mm_add_epi32 ( xsum,  xtmp1 );
154
0
      xsum          = _mm_add_epi32 ( xsum,  xtmp2 );
155
156
      //int diff = origRowStart[x1] - bufferRowStart[x1];
157
      //error += diff * diff;
158
      //diff = origRowStart[x1 + 1] - bufferRowStart[x1 + 1];
159
      //error += diff * diff;
160
0
    }
161
    
162
0
    xerror = _mm_hadd_epi32   ( xerror, xsum );
163
0
    error  = _mm_cvtsi128_si32( xerror );
164
165
0
    if( error > besterror )
166
0
    {
167
0
      return error;
168
0
    }
169
0
  }
170
171
0
  xerror = _mm_hadd_epi32( xerror, xerror );
172
0
  xerror = _mm_hadd_epi32( xerror, xerror );
173
0
  error  = _mm_cvtsi128_si32( xerror );
174
0
  return error;
175
0
}
Unexecuted instantiation: int vvenc::motionErrorLumaInt_SIMD<(vvenc::x86_simd::X86_VEXT)1>(short const*, long, short const*, long, int, int, int)
Unexecuted instantiation: int vvenc::motionErrorLumaInt_SIMD<(vvenc::x86_simd::X86_VEXT)4>(short const*, long, short const*, long, int, int, int)
176
177
template<X86_VEXT vext>
178
int motionErrorLumaFrac_SIMD( const Pel* org, const ptrdiff_t origStride, const Pel* buf, const ptrdiff_t buffStride, const int w, const int h, const int16_t* xFilter, const int16_t* yFilter, const int bitDepth, const int besterror )
179
0
{
180
0
  int error = 0;
181
0
  const int base = -3;
182
0
  __m128i xerror = _mm_setzero_si128();
183
  
184
0
  CHECK( w & 7, "SIMD blockSize needs to be a multiple of 8" );
185
186
0
  const Pel maxSampleValue = ( 1 << bitDepth ) - 1;
187
188
#if USE_AVX2
189
0
  if( vext >= AVX2 && ( w & 15 ) == 0 )
190
0
  {
191
0
    const __m256i yfilt12 = _mm256_unpacklo_epi16( _mm256_set1_epi16( yFilter[1] ), _mm256_set1_epi16( yFilter[2] ) );
192
0
    const __m256i yfilt34 = _mm256_unpacklo_epi16( _mm256_set1_epi16( yFilter[3] ), _mm256_set1_epi16( yFilter[4] ) );
193
0
    const __m256i yfilt56 = _mm256_unpacklo_epi16( _mm256_set1_epi16( yFilter[5] ), _mm256_set1_epi16( yFilter[6] ) );
194
195
    const __m256i xfilt12 = _mm256_unpacklo_epi16( _mm256_set1_epi16( xFilter[1] ), _mm256_set1_epi16( xFilter[2] ) );
196
    const __m256i xfilt34 = _mm256_unpacklo_epi16( _mm256_set1_epi16( xFilter[3] ), _mm256_set1_epi16( xFilter[4] ) );
197
    const __m256i xfilt56 = _mm256_unpacklo_epi16( _mm256_set1_epi16( xFilter[5] ), _mm256_set1_epi16( xFilter[6] ) );
198
199
    const __m256i xmax = _mm256_set1_epi16( maxSampleValue );
200
    const __m256i xmin = _mm256_setzero_si256();
201
202
    const int yOffset = 1 - 3;
203
    const Pel* sourceCol = buf + base + yOffset * buffStride;
204
    const Pel* origCol = org;
205
206
0
    for( int x1 = 0; x1 < w; x1 += 16, sourceCol += 16, origCol += 16 )
207
0
    {
208
0
      const Pel* origRow = origCol;
209
0
      const Pel* rowStart = sourceCol;
210
211
0
      __m256i xsrc[6];
212
213
0
      for( int y1 = 1; y1 < h + 6; y1++, rowStart += buffStride )
214
0
      {
215
0
        __m256i xsrc1 = _mm256_loadu_si256( ( const __m256i* ) & rowStart[1] );
216
0
        __m256i xsrc2 = _mm256_loadu_si256( ( const __m256i* ) & rowStart[2] );
217
0
        __m256i xsrc3 = _mm256_loadu_si256( ( const __m256i* ) & rowStart[3] );
218
0
        __m256i xsrc4 = _mm256_loadu_si256( ( const __m256i* ) & rowStart[4] );
219
0
        __m256i xsrc5 = _mm256_loadu_si256( ( const __m256i* ) & rowStart[5] );
220
0
        __m256i xsrc6 = _mm256_loadu_si256( ( const __m256i* ) & rowStart[6] );
221
222
0
        __m256i
223
0
          xsum0 = _mm256_set1_epi32( 1 << 5 );
224
0
        __m256i
225
0
          xsum1 = _mm256_set1_epi32( 1 << 5 );
226
227
0
        xsum0 = _mm256_add_epi32( xsum0, _mm256_madd_epi16( _mm256_unpacklo_epi16( xsrc1, xsrc2 ), xfilt12 ) );
228
0
        xsum1 = _mm256_add_epi32( xsum1, _mm256_madd_epi16( _mm256_unpackhi_epi16( xsrc1, xsrc2 ), xfilt12 ) );
229
230
0
        xsum0 = _mm256_add_epi32( xsum0, _mm256_madd_epi16( _mm256_unpacklo_epi16( xsrc3, xsrc4 ), xfilt34 ) );
231
0
        xsum1 = _mm256_add_epi32( xsum1, _mm256_madd_epi16( _mm256_unpackhi_epi16( xsrc3, xsrc4 ), xfilt34 ) );
232
233
0
        xsum0 = _mm256_add_epi32( xsum0, _mm256_madd_epi16( _mm256_unpacklo_epi16( xsrc5, xsrc6 ), xfilt56 ) );
234
0
        xsum1 = _mm256_add_epi32( xsum1, _mm256_madd_epi16( _mm256_unpackhi_epi16( xsrc5, xsrc6 ), xfilt56 ) );
235
236
0
        xsum0 = _mm256_srai_epi32( xsum0, 6 );
237
0
        xsum1 = _mm256_srai_epi32( xsum1, 6 );
238
0
        __m256i
239
0
        xsum = _mm256_packs_epi32( xsum0, xsum1 );
240
0
        xsum = _mm256_min_epi16( xmax, _mm256_max_epi16( xmin, xsum ) );
241
242
0
        if( y1 >= 6 )
243
0
        {
244
0
          xsrc[0] = xsrc[1];
245
0
          xsrc[1] = xsrc[2];
246
0
          xsrc[2] = xsrc[3];
247
0
          xsrc[3] = xsrc[4];
248
0
          xsrc[4] = xsrc[5];
249
0
          xsrc[5] = xsum;
250
251
0
          xsum0 = _mm256_set1_epi32( 1 << 5 );
252
0
          xsum1 = _mm256_set1_epi32( 1 << 5 );
253
254
0
          xsum0 = _mm256_add_epi32( xsum0, _mm256_madd_epi16( yfilt12, _mm256_unpacklo_epi16( xsrc[0], xsrc[1] ) ) );
255
0
          xsum1 = _mm256_add_epi32( xsum1, _mm256_madd_epi16( yfilt12, _mm256_unpackhi_epi16( xsrc[0], xsrc[1] ) ) );
256
257
0
          xsum0 = _mm256_add_epi32( xsum0, _mm256_madd_epi16( yfilt34, _mm256_unpacklo_epi16( xsrc[2], xsrc[3] ) ) );
258
0
          xsum1 = _mm256_add_epi32( xsum1, _mm256_madd_epi16( yfilt34, _mm256_unpackhi_epi16( xsrc[2], xsrc[3] ) ) );
259
260
0
          xsum0 = _mm256_add_epi32( xsum0, _mm256_madd_epi16( yfilt56, _mm256_unpacklo_epi16( xsrc[4], xsrc[5] ) ) );
261
0
          xsum1 = _mm256_add_epi32( xsum1, _mm256_madd_epi16( yfilt56, _mm256_unpackhi_epi16( xsrc[4], xsrc[5] ) ) );
262
263
0
          xsum0 = _mm256_srai_epi32( xsum0, 6 );
264
0
          xsum1 = _mm256_srai_epi32( xsum1, 6 );
265
266
0
          xsum = _mm256_packs_epi32( xsum0, xsum1 );
267
0
          xsum = _mm256_min_epi16( xmax, _mm256_max_epi16( xmin, xsum ) );
268
269
0
          __m256i
270
0
          xorg  = _mm256_loadu_si256( ( const __m256i* ) origRow );
271
0
          origRow += origStride;
272
273
0
          xsum = _mm256_sub_epi16( xsum, xorg );
274
0
          xsum = _mm256_madd_epi16( xsum, xsum );
275
        
276
0
          __m128i
277
0
          ysum = _mm_add_epi32( _mm256_castsi256_si128( xsum ), _mm256_extracti128_si256( xsum, 1 ) );
278
0
          xerror = _mm_hadd_epi32( xerror, ysum );
279
0
          error = _mm_cvtsi128_si32( xerror );
280
281
0
          if( error > besterror )
282
0
          {
283
0
            return error;
284
0
          }
285
0
        }
286
0
        else
287
0
        {
288
0
          xsrc[y1] = xsum;
289
0
        }
290
0
      }
291
0
    }
292
293
0
    xerror = _mm_hadd_epi32( xerror, xerror );
294
0
    xerror = _mm_hadd_epi32( xerror, xerror );
295
0
    error  = _mm_cvtsi128_si32( xerror );
296
297
0
    return error;
298
0
  }
299
0
#endif
300
301
0
  const __m128i yfilt12 = _mm_unpacklo_epi16( _mm_set1_epi16( yFilter[1] ), _mm_set1_epi16( yFilter[2] ) );
302
0
  const __m128i yfilt34 = _mm_unpacklo_epi16( _mm_set1_epi16( yFilter[3] ), _mm_set1_epi16( yFilter[4] ) );
303
0
  const __m128i yfilt56 = _mm_unpacklo_epi16( _mm_set1_epi16( yFilter[5] ), _mm_set1_epi16( yFilter[6] ) );
304
305
0
  const __m128i xfilt12 = _mm_unpacklo_epi16( _mm_set1_epi16( xFilter[1] ), _mm_set1_epi16( xFilter[2] ) );
306
0
  const __m128i xfilt34 = _mm_unpacklo_epi16( _mm_set1_epi16( xFilter[3] ), _mm_set1_epi16( xFilter[4] ) );
307
0
  const __m128i xfilt56 = _mm_unpacklo_epi16( _mm_set1_epi16( xFilter[5] ), _mm_set1_epi16( xFilter[6] ) );
308
  
309
0
  const __m128i xmax   = _mm_set1_epi16( maxSampleValue );
310
0
  const __m128i xmin   = _mm_setzero_si128();
311
  
312
0
  const int yOffset    = 1 - 3;
313
0
  const Pel* sourceCol = buf + base + yOffset * buffStride;
314
0
  const Pel* origCol   = org;
315
316
0
  for( int x1 = 0; x1 < w; x1 += 8, sourceCol += 8, origCol += 8 )
317
0
  {
318
0
    const Pel* origRow  = origCol;
319
0
    const Pel* rowStart = sourceCol;
320
321
0
    __m128i xsrc[6];
322
323
0
    for( int y1 = 1; y1 < h + 6; y1++, rowStart += buffStride )
324
0
    {
325
0
      __m128i xsrc1 = _mm_loadu_si128( ( const __m128i * ) &rowStart[1] );
326
0
      __m128i xsrc2 = _mm_loadu_si128( ( const __m128i * ) &rowStart[2] );
327
0
      __m128i xsrc3 = _mm_loadu_si128( ( const __m128i * ) &rowStart[3] );
328
0
      __m128i xsrc4 = _mm_loadu_si128( ( const __m128i * ) &rowStart[4] );
329
0
      __m128i xsrc5 = _mm_loadu_si128( ( const __m128i * ) &rowStart[5] );
330
0
      __m128i xsrc6 = _mm_loadu_si128( ( const __m128i * ) &rowStart[6] );
331
332
0
      __m128i
333
0
      xsum0 = _mm_set1_epi32( 1 << 5 );
334
0
      __m128i
335
0
      xsum1 = _mm_set1_epi32( 1 << 5 );
336
337
0
      xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( _mm_unpacklo_epi16( xsrc1, xsrc2 ), xfilt12 ) );
338
0
      xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( _mm_unpackhi_epi16( xsrc1, xsrc2 ), xfilt12 ) );
339
340
0
      xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( _mm_unpacklo_epi16( xsrc3, xsrc4 ), xfilt34 ) );
341
0
      xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( _mm_unpackhi_epi16( xsrc3, xsrc4 ), xfilt34 ) );
342
343
0
      xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( _mm_unpacklo_epi16( xsrc5, xsrc6 ), xfilt56 ) );
344
0
      xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( _mm_unpackhi_epi16( xsrc5, xsrc6 ), xfilt56 ) );
345
346
0
      xsum0 = _mm_srai_epi32( xsum0, 6 );
347
0
      xsum1 = _mm_srai_epi32( xsum1, 6 );
348
0
      __m128i
349
0
      xsum  = _mm_packs_epi32( xsum0, xsum1 );
350
0
      xsum  = _mm_min_epi16( xmax, _mm_max_epi16( xmin, xsum ) );
351
352
0
      if( y1 >= 6 )
353
0
      {
354
0
        xsrc[0] = xsrc[1];
355
0
        xsrc[1] = xsrc[2];
356
0
        xsrc[2] = xsrc[3];
357
0
        xsrc[3] = xsrc[4];
358
0
        xsrc[4] = xsrc[5];
359
0
        xsrc[5] = xsum;
360
        
361
0
        xsum0 = _mm_set1_epi32( 1 << 5 );
362
0
        xsum1 = _mm_set1_epi32( 1 << 5 );
363
364
0
        xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( yfilt12, _mm_unpacklo_epi16( xsrc[0], xsrc[1] ) ) );
365
0
        xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( yfilt12, _mm_unpackhi_epi16( xsrc[0], xsrc[1] ) ) );
366
367
0
        xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( yfilt34, _mm_unpacklo_epi16( xsrc[2], xsrc[3] ) ) );
368
0
        xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( yfilt34, _mm_unpackhi_epi16( xsrc[2], xsrc[3] ) ) );
369
370
0
        xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( yfilt56, _mm_unpacklo_epi16( xsrc[4], xsrc[5] ) ) );
371
0
        xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( yfilt56, _mm_unpackhi_epi16( xsrc[4], xsrc[5] ) ) );
372
        
373
0
        xsum0 = _mm_srai_epi32( xsum0, 6 );
374
0
        xsum1 = _mm_srai_epi32( xsum1, 6 );
375
376
0
        xsum  = _mm_packs_epi32( xsum0, xsum1 );
377
0
        xsum  = _mm_min_epi16  ( xmax, _mm_max_epi16( xmin, xsum ) );
378
379
0
        __m128i
380
0
        xorg = _mm_loadu_si128( ( const __m128i * ) origRow );
381
0
        origRow += origStride;
382
383
0
        xsum = _mm_sub_epi16 ( xsum, xorg );
384
0
        xsum = _mm_madd_epi16( xsum, xsum );
385
0
        xerror = _mm_hadd_epi32( xerror, xsum );
386
0
        error  = _mm_cvtsi128_si32( xerror );
387
388
        //sum = 0;
389
        //sum += yFilter[1] * tempArray[y1 + 1][x1];
390
        //sum += yFilter[2] * tempArray[y1 + 2][x1];
391
        //sum += yFilter[3] * tempArray[y1 + 3][x1];
392
        //sum += yFilter[4] * tempArray[y1 + 4][x1];
393
        //sum += yFilter[5] * tempArray[y1 + 5][x1];
394
        //sum += yFilter[6] * tempArray[y1 + 6][x1];
395
        //
396
        //sum = ( sum + ( 1 << 5 ) ) >> 6;
397
        //sum = sum < 0 ? 0 : ( sum > maxSampleValue ? maxSampleValue : sum );
398
        //
399
        //error += ( sum - origRow[x + x1] ) * ( sum - origRow[x + x1] );
400
401
0
        if( error > besterror )
402
0
        {
403
0
          return error;
404
0
        }
405
0
      }
406
0
      else
407
0
      {
408
0
        xsrc[y1] = xsum;
409
      
410
        //sum  = 0;
411
        //sum += xFilter[1] * rowStart[1];
412
        //sum += xFilter[2] * rowStart[2];
413
        //sum += xFilter[3] * rowStart[3];
414
        //sum += xFilter[4] * rowStart[4];
415
        //sum += xFilter[5] * rowStart[5];
416
        //sum += xFilter[6] * rowStart[6];
417
        //
418
        //sum = ( sum + ( 1 << 5 ) ) >> 6;
419
        //sum = sum < 0 ? 0 : ( sum > maxSampleValue ? maxSampleValue : sum );
420
        //
421
        //tempArray[y1][x1] = sum;
422
0
      }
423
0
    }
424
0
  }
425
426
0
  xerror = _mm_hadd_epi32( xerror, xerror );
427
0
  xerror = _mm_hadd_epi32( xerror, xerror );
428
0
  error  = _mm_cvtsi128_si32( xerror );
429
430
0
  return error;
431
0
}
Unexecuted instantiation: int vvenc::motionErrorLumaFrac_SIMD<(vvenc::x86_simd::X86_VEXT)1>(short const*, long, short const*, long, int, int, short const*, short const*, int, int)
Unexecuted instantiation: int vvenc::motionErrorLumaFrac_SIMD<(vvenc::x86_simd::X86_VEXT)4>(short const*, long, short const*, long, int, int, short const*, short const*, int, int)
432
433
434
template<X86_VEXT vext>
435
int motionErrorLumaFrac_loRes_SIMD( const Pel* org, const ptrdiff_t origStride, const Pel* buf, const ptrdiff_t buffStride, const int w, const int h, const int16_t* xFilter, const int16_t* yFilter, const int bitDepth, const int besterror )
436
0
{
437
0
  int error = 0;
438
0
  const int base = -1;
439
0
  __m128i xerror = _mm_setzero_si128();
440
  
441
0
  CHECK( w & 7, "SIMD blockSize needs to be a multiple of 8" );
442
443
0
  cond_mm_prefetch( ( const char* )   org,                            _MM_HINT_T0 );
444
0
  cond_mm_prefetch( ( const char* ) ( buf + base + -1 * buffStride ), _MM_HINT_T0 );
445
446
#if USE_AVX2
447
0
  if( vext >= AVX2 && ( w & 15 ) == 0 )
448
0
  {
449
0
    GCC_WARNING_DISABLE_maybe_uninitialized
450
0
    const Pel maxSampleValue = ( 1 << bitDepth ) - 1;
451
452
    const __m256i yfilt12 = _mm256_unpacklo_epi16( _mm256_set1_epi16( yFilter[0] ), _mm256_set1_epi16( yFilter[1] ) );
453
    const __m256i yfilt34 = _mm256_unpacklo_epi16( _mm256_set1_epi16( yFilter[2] ), _mm256_set1_epi16( yFilter[3] ) );
454
455
    const __m256i xfilt12 = _mm256_unpacklo_epi16( _mm256_set1_epi16( xFilter[0] ), _mm256_set1_epi16( xFilter[1] ) );
456
    const __m256i xfilt34 = _mm256_unpacklo_epi16( _mm256_set1_epi16( xFilter[2] ), _mm256_set1_epi16( xFilter[3] ) );
457
  
458
    const __m256i xmax   = _mm256_set1_epi16( maxSampleValue );
459
    const __m256i xmin = _mm256_setzero_si256();
460
  
461
    const int yOffset    = -1;
462
    const Pel* sourceCol = buf + base + yOffset * buffStride;
463
    const Pel* origCol   = org;
464
    __m256i verror = _mm256_setzero_si256();
465
466
0
    for( int x1 = 0; x1 < w; x1 += 16, sourceCol += 16, origCol += 16 )
467
0
    {
468
0
      const Pel* origRow  = origCol;
469
0
      const Pel* rowStart = sourceCol;
470
471
#ifdef NDEBUG
472
      __m256i vsrc0, vsrc1, vsrc2, vsrc3;
473
#else
474
0
      __m256i
475
0
        vsrc0 = _mm256_setzero_si256(), vsrc1 = _mm256_setzero_si256(),
476
0
        vsrc2 = _mm256_setzero_si256(), vsrc3 = _mm256_setzero_si256();
477
0
#endif
478
479
0
      for( int y1 = 0; y1 < h + 3; y1++, rowStart += buffStride )
480
0
      {
481
0
        cond_mm_prefetch( ( const char* ) ( origRow  + origStride ), _MM_HINT_T0 );
482
0
        cond_mm_prefetch( ( const char* ) ( rowStart + buffStride ), _MM_HINT_T0 );
483
484
0
        __m256i xsrc1 = _mm256_loadu_si256( ( const __m256i * ) &rowStart[0] );
485
0
        __m256i xsrc2 = _mm256_loadu_si256( ( const __m256i * ) &rowStart[1] );
486
0
        __m256i xsrc3 = _mm256_loadu_si256( ( const __m256i * ) &rowStart[2] );
487
0
        __m256i xsrc4 = _mm256_loadu_si256( ( const __m256i * ) &rowStart[3] );
488
489
0
        __m256i
490
0
        xsum0 = _mm256_set1_epi32( 1 << 5 );
491
0
        __m256i
492
0
        xsum1 = _mm256_set1_epi32( 1 << 5 );
493
494
0
        xsum0 = _mm256_add_epi32( xsum0, _mm256_madd_epi16( _mm256_unpacklo_epi16( xsrc1, xsrc2 ), xfilt12 ) );
495
0
        xsum1 = _mm256_add_epi32( xsum1, _mm256_madd_epi16( _mm256_unpackhi_epi16( xsrc1, xsrc2 ), xfilt12 ) );
496
497
0
        xsum0 = _mm256_add_epi32( xsum0, _mm256_madd_epi16( _mm256_unpacklo_epi16( xsrc3, xsrc4 ), xfilt34 ) );
498
0
        xsum1 = _mm256_add_epi32( xsum1, _mm256_madd_epi16( _mm256_unpackhi_epi16( xsrc3, xsrc4 ), xfilt34 ) );
499
500
0
        xsum0 = _mm256_srai_epi32( xsum0, 6 );
501
0
        xsum1 = _mm256_srai_epi32( xsum1, 6 );
502
0
        __m256i
503
0
        xsum  = _mm256_packs_epi32( xsum0, xsum1 );
504
0
        xsum  = _mm256_min_epi16( xmax, _mm256_max_epi16( xmin, xsum ) );
505
506
0
        if( y1 >= 3 )
507
0
        {
508
0
          vsrc0 = vsrc1;
509
0
          vsrc1 = vsrc2;
510
0
          vsrc2 = vsrc3;
511
0
          vsrc3 = xsum;
512
        
513
0
          xsum0 = _mm256_set1_epi32( 1 << 5 );
514
0
          xsum1 = _mm256_set1_epi32( 1 << 5 );
515
516
0
          xsum0 = _mm256_add_epi32( xsum0, _mm256_madd_epi16( yfilt12, _mm256_unpacklo_epi16( vsrc0, vsrc1 ) ) );
517
0
          xsum1 = _mm256_add_epi32( xsum1, _mm256_madd_epi16( yfilt12, _mm256_unpackhi_epi16( vsrc0, vsrc1 ) ) );
518
519
0
          xsum0 = _mm256_add_epi32( xsum0, _mm256_madd_epi16( yfilt34, _mm256_unpacklo_epi16( vsrc2, vsrc3 ) ) );
520
0
          xsum1 = _mm256_add_epi32( xsum1, _mm256_madd_epi16( yfilt34, _mm256_unpackhi_epi16( vsrc2, vsrc3 ) ) );
521
        
522
0
          xsum0 = _mm256_srai_epi32( xsum0, 6 );
523
0
          xsum1 = _mm256_srai_epi32( xsum1, 6 );
524
525
0
          xsum  = _mm256_packs_epi32( xsum0, xsum1 );
526
0
          xsum  = _mm256_min_epi16  ( xmax, _mm256_max_epi16( xmin, xsum ) );
527
528
0
          __m256i
529
0
          xorg  = _mm256_loadu_si256( ( const __m256i * ) origRow );
530
0
          origRow += origStride;
531
532
0
          xsum = _mm256_sub_epi16( xsum, xorg );
533
0
          xsum = _mm256_madd_epi16( xsum, xsum );
534
535
0
          verror = _mm256_add_epi32( verror, xsum );
536
0
        }
537
0
        else
538
0
        {
539
0
          vsrc1 = vsrc2;
540
0
          vsrc2 = vsrc3;
541
0
          vsrc3 = xsum;
542
0
        }
543
0
      }
544
0
    }
545
546
    GCC_WARNING_RESET
547
548
    xerror = _mm_add_epi32( _mm256_castsi256_si128( verror ), _mm256_extracti128_si256( verror , 1 ) );
549
    xerror = _mm_hadd_epi32( xerror, xerror );
550
    xerror = _mm_hadd_epi32( xerror, xerror );
551
    error  = _mm_cvtsi128_si32( xerror );
552
553
0
    return error;
554
0
  }
555
0
#endif 
556
557
0
  const Pel maxSampleValue = ( 1 << bitDepth ) - 1;
558
559
0
  const __m128i yfilt12 = _mm_unpacklo_epi16( _mm_set1_epi16( yFilter[0] ), _mm_set1_epi16( yFilter[1] ) );
560
0
  const __m128i yfilt34 = _mm_unpacklo_epi16( _mm_set1_epi16( yFilter[2] ), _mm_set1_epi16( yFilter[3] ) );
561
562
0
  const __m128i xfilt12 = _mm_unpacklo_epi16( _mm_set1_epi16( xFilter[0] ), _mm_set1_epi16( xFilter[1] ) );
563
0
  const __m128i xfilt34 = _mm_unpacklo_epi16( _mm_set1_epi16( xFilter[2] ), _mm_set1_epi16( xFilter[3] ) );
564
  
565
0
  const __m128i xmax   = _mm_set1_epi16( maxSampleValue );
566
0
  const __m128i xmin   = _mm_setzero_si128();
567
  
568
0
  const int yOffset    = -1;
569
0
  const Pel* sourceCol = buf + base + yOffset * buffStride;
570
0
  const Pel* origCol   = org;
571
572
0
  for( int x1 = 0; x1 < w; x1 += 8, sourceCol += 8, origCol += 8 )
573
0
  {
574
0
    const Pel* origRow  = origCol;
575
0
    const Pel* rowStart = sourceCol;
576
577
0
    __m128i xsrc[4];
578
579
0
    for( int y1 = 0; y1 < h + 3; y1++, rowStart += buffStride )
580
0
    {
581
0
      cond_mm_prefetch( ( const char* ) ( origRow + origStride  ), _MM_HINT_T0 );
582
0
      cond_mm_prefetch( ( const char* ) ( rowStart + buffStride ), _MM_HINT_T0 );
583
584
0
      __m128i xsrc1 = _mm_loadu_si128( ( const __m128i * ) &rowStart[0] );
585
0
      __m128i xsrc2 = _mm_loadu_si128( ( const __m128i * ) &rowStart[1] );
586
0
      __m128i xsrc3 = _mm_loadu_si128( ( const __m128i * ) &rowStart[2] );
587
0
      __m128i xsrc4 = _mm_loadu_si128( ( const __m128i * ) &rowStart[3] );
588
589
0
      __m128i
590
0
      xsum0 = _mm_set1_epi32( 1 << 5 );
591
0
      __m128i
592
0
      xsum1 = _mm_set1_epi32( 1 << 5 );
593
594
0
      xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( _mm_unpacklo_epi16( xsrc1, xsrc2 ), xfilt12 ) );
595
0
      xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( _mm_unpackhi_epi16( xsrc1, xsrc2 ), xfilt12 ) );
596
597
0
      xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( _mm_unpacklo_epi16( xsrc3, xsrc4 ), xfilt34 ) );
598
0
      xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( _mm_unpackhi_epi16( xsrc3, xsrc4 ), xfilt34 ) );
599
600
0
      xsum0 = _mm_srai_epi32( xsum0, 6 );
601
0
      xsum1 = _mm_srai_epi32( xsum1, 6 );
602
0
      __m128i
603
0
      xsum  = _mm_packs_epi32( xsum0, xsum1 );
604
0
      xsum  = _mm_min_epi16( xmax, _mm_max_epi16( xmin, xsum ) );
605
606
0
      if( y1 >= 3 )
607
0
      {
608
0
        xsrc[0] = xsrc[1];
609
0
        xsrc[1] = xsrc[2];
610
0
        xsrc[2] = xsrc[3];
611
0
        xsrc[3] = xsum;
612
        
613
0
        xsum0 = _mm_set1_epi32( 1 << 5 );
614
0
        xsum1 = _mm_set1_epi32( 1 << 5 );
615
616
0
        xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( yfilt12, _mm_unpacklo_epi16( xsrc[0], xsrc[1] ) ) );
617
0
        xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( yfilt12, _mm_unpackhi_epi16( xsrc[0], xsrc[1] ) ) );
618
619
0
        xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( yfilt34, _mm_unpacklo_epi16( xsrc[2], xsrc[3] ) ) );
620
0
        xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( yfilt34, _mm_unpackhi_epi16( xsrc[2], xsrc[3] ) ) );
621
        
622
0
        xsum0 = _mm_srai_epi32( xsum0, 6 );
623
0
        xsum1 = _mm_srai_epi32( xsum1, 6 );
624
625
0
        xsum  = _mm_packs_epi32( xsum0, xsum1 );
626
0
        xsum  = _mm_min_epi16  ( xmax, _mm_max_epi16( xmin, xsum ) );
627
628
0
        __m128i
629
0
        xorg = _mm_loadu_si128( ( const __m128i * ) origRow );
630
0
        origRow += origStride;
631
632
0
        xsum = _mm_sub_epi16 ( xsum, xorg );
633
0
        xsum = _mm_madd_epi16( xsum, xsum );
634
0
        xerror = _mm_hadd_epi32( xerror, xsum );
635
0
        error = _mm_cvtsi128_si32( xerror );
636
637
        //sum = 0;
638
        //sum += yFilter[1] * tempArray[y1 + 1][x1];
639
        //sum += yFilter[2] * tempArray[y1 + 2][x1];
640
        //sum += yFilter[3] * tempArray[y1 + 3][x1];
641
        //sum += yFilter[4] * tempArray[y1 + 4][x1];
642
        //
643
        //sum = ( sum + ( 1 << 5 ) ) >> 6;
644
        //sum = sum < 0 ? 0 : ( sum > maxSampleValue ? maxSampleValue : sum );
645
        //
646
        //error += ( sum - origRow[x + x1] ) * ( sum - origRow[x + x1] );
647
648
0
        if( error > besterror )
649
0
        {
650
0
          return error;
651
0
        }
652
0
      }
653
0
      else
654
0
      {
655
0
        xsrc[y1 + 1] = xsum;
656
      
657
        //sum  = 0;
658
        //sum += xFilter[1] * rowStart[1];
659
        //sum += xFilter[2] * rowStart[2];
660
        //sum += xFilter[3] * rowStart[3];
661
        //sum += xFilter[4] * rowStart[4];
662
        //
663
        //sum = ( sum + ( 1 << 5 ) ) >> 6;
664
        //sum = sum < 0 ? 0 : ( sum > maxSampleValue ? maxSampleValue : sum );
665
        //
666
        //tempArray[y1][x1] = sum;
667
0
      }
668
0
    }
669
0
  }
670
671
0
  xerror = _mm_hadd_epi32( xerror, xerror );
672
0
  xerror = _mm_hadd_epi32( xerror, xerror );
673
0
  error  = _mm_cvtsi128_si32( xerror );
674
675
0
  return error;
676
0
}
Unexecuted instantiation: int vvenc::motionErrorLumaFrac_loRes_SIMD<(vvenc::x86_simd::X86_VEXT)1>(short const*, long, short const*, long, int, int, short const*, short const*, int, int)
Unexecuted instantiation: int vvenc::motionErrorLumaFrac_loRes_SIMD<(vvenc::x86_simd::X86_VEXT)4>(short const*, long, short const*, long, int, int, short const*, short const*, int, int)
677
678
// Separable 6-tap fractional-sample interpolation for blocks whose width is a
// multiple of 8, processing 8 samples (one 128-bit register) per step.
// For every 8-wide column strip the horizontal filter (xFilter taps 1..6; tap
// index 0 is unused) is applied first; the most recent 6 horizontally filtered
// rows are kept in a sliding window (xsrc[]) and combined with the vertical
// filter (yFilter taps 1..6). Both stages round via (sum + 32) >> 6; the final
// result is clipped to [0, (1 << bitDepth) - 1] and written to buf.
//
// org        : source samples; filtering reads org[base + x] horizontally and
//              starts yOffset rows above the first output row.
// origStride : stride of org (in samples).
// buf        : destination for bsx * bsy filtered samples.
// buffStride : stride of buf (in samples).
// xFilter/yFilter : 16-bit tap arrays; only indices 1..6 are read here.
template<X86_VEXT vext>
void applyFrac6tap_SIMD_8x( const Pel* org, const ptrdiff_t origStride, Pel* buf, const ptrdiff_t buffStride, const int bsx, const int bsy, const int16_t* xFilter, const int16_t* yFilter, const int bitDepth )
{
  const int base = -3; // horizontal offset of the first tap relative to the output sample

  CHECK( bsx & 7, "SIMD blockSizeX needs to be a multiple of 8" );

  const Pel maxSampleValue = ( 1 << bitDepth ) - 1;

  // Interleave adjacent tap pairs so a single _mm_madd_epi16 applies two taps
  // to two interleaved sample vectors at once.
  const __m128i yfilt12 = _mm_unpacklo_epi16( _mm_set1_epi16( yFilter[1] ), _mm_set1_epi16( yFilter[2] ) );
  const __m128i yfilt34 = _mm_unpacklo_epi16( _mm_set1_epi16( yFilter[3] ), _mm_set1_epi16( yFilter[4] ) );
  const __m128i yfilt56 = _mm_unpacklo_epi16( _mm_set1_epi16( yFilter[5] ), _mm_set1_epi16( yFilter[6] ) );

  const __m128i xfilt12 = _mm_unpacklo_epi16( _mm_set1_epi16( xFilter[1] ), _mm_set1_epi16( xFilter[2] ) );
  const __m128i xfilt34 = _mm_unpacklo_epi16( _mm_set1_epi16( xFilter[3] ), _mm_set1_epi16( xFilter[4] ) );
  const __m128i xfilt56 = _mm_unpacklo_epi16( _mm_set1_epi16( xFilter[5] ), _mm_set1_epi16( xFilter[6] ) );

  // Clipping bounds for the final pack/store.
  const __m128i xmax = _mm_set1_epi16( maxSampleValue );
  const __m128i xmin = _mm_setzero_si128();

  const int yOffset = 1 - 3; // first vertical tap sits 2 rows above the output row
  const Pel* srcCol = org + base + yOffset * origStride;
        Pel* dstCol = buf;

  for( int x1 = 0; x1 < bsx; x1 += 8, srcCol += 8, dstCol += 8 )
  {
    const Pel* srcRow = srcCol;
          Pel* dstRow = dstCol;

    // Sliding window of the last 6 horizontally filtered rows.
    // Slots 1..5 are primed during y1 = 1..5; slot 0 is first written from
    // slot 1 below before it is ever read, so it never carries garbage.
    __m128i xsrc[6];

    for( int y1 = 1; y1 < bsy + 6; y1++, srcRow += origStride )
    {
      // Six shifted loads covering taps 1..6 of the horizontal filter.
      __m128i xsrc1 = _mm_loadu_si128( ( const __m128i* ) &srcRow[1] );
      __m128i xsrc2 = _mm_loadu_si128( ( const __m128i* ) &srcRow[2] );
      __m128i xsrc3 = _mm_loadu_si128( ( const __m128i* ) &srcRow[3] );
      __m128i xsrc4 = _mm_loadu_si128( ( const __m128i* ) &srcRow[4] );
      __m128i xsrc5 = _mm_loadu_si128( ( const __m128i* ) &srcRow[5] );
      __m128i xsrc6 = _mm_loadu_si128( ( const __m128i* ) &srcRow[6] );

      // Accumulators start with the rounding offset 1 << 5 (for the >> 6 below).
      __m128i
      xsum0 = _mm_set1_epi32( 1 << 5 );
      __m128i
      xsum1 = _mm_set1_epi32( 1 << 5 );

      // Horizontal filtering: low/high halves handled separately in 32 bit.
      xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( _mm_unpacklo_epi16( xsrc1, xsrc2 ), xfilt12 ) );
      xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( _mm_unpackhi_epi16( xsrc1, xsrc2 ), xfilt12 ) );

      xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( _mm_unpacklo_epi16( xsrc3, xsrc4 ), xfilt34 ) );
      xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( _mm_unpackhi_epi16( xsrc3, xsrc4 ), xfilt34 ) );

      xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( _mm_unpacklo_epi16( xsrc5, xsrc6 ), xfilt56 ) );
      xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( _mm_unpackhi_epi16( xsrc5, xsrc6 ), xfilt56 ) );

      xsum0 = _mm_srai_epi32( xsum0, 6 );
      xsum1 = _mm_srai_epi32( xsum1, 6 );
      __m128i
      xsum = _mm_packs_epi32( xsum0, xsum1 );

      if( y1 >= 6 )
      {
        // Window is full: shift it down and append the newest filtered row.
        xsrc[0] = xsrc[1];
        xsrc[1] = xsrc[2];
        xsrc[2] = xsrc[3];
        xsrc[3] = xsrc[4];
        xsrc[4] = xsrc[5];
        xsrc[5] = xsum;

        // Vertical filtering over the 6-row window, same rounding scheme.
        xsum0 = _mm_set1_epi32( 1 << 5 );
        xsum1 = _mm_set1_epi32( 1 << 5 );

        xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( yfilt12, _mm_unpacklo_epi16( xsrc[0], xsrc[1] ) ) );
        xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( yfilt12, _mm_unpackhi_epi16( xsrc[0], xsrc[1] ) ) );

        xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( yfilt34, _mm_unpacklo_epi16( xsrc[2], xsrc[3] ) ) );
        xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( yfilt34, _mm_unpackhi_epi16( xsrc[2], xsrc[3] ) ) );

        xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( yfilt56, _mm_unpacklo_epi16( xsrc[4], xsrc[5] ) ) );
        xsum1 = _mm_add_epi32( xsum1, _mm_madd_epi16( yfilt56, _mm_unpackhi_epi16( xsrc[4], xsrc[5] ) ) );

        xsum0 = _mm_srai_epi32( xsum0, 6 );
        xsum1 = _mm_srai_epi32( xsum1, 6 );

        // Clip to the valid sample range and store one output row.
        xsum = _mm_packs_epi32( xsum0, xsum1 );
        xsum = _mm_min_epi16( xmax, _mm_max_epi16( xmin, xsum ) );

        _mm_storeu_si128( ( __m128i* ) dstRow, xsum );
        dstRow += buffStride;
      }
      else
      {
        // Priming phase: fill window slots 1..5 before any output is produced.
        xsrc[y1] = xsum;
      }
    }
  }
}
Unexecuted instantiation: void vvenc::applyFrac6tap_SIMD_8x<(vvenc::x86_simd::X86_VEXT)1>(short const*, long, short*, long, int, int, short const*, short const*, int)
Unexecuted instantiation: void vvenc::applyFrac6tap_SIMD_8x<(vvenc::x86_simd::X86_VEXT)4>(short const*, long, short*, long, int, int, short const*, short const*, int)
774
775
776
// 4-samples-per-step variant of applyFrac6tap_SIMD_8x for blocks whose width
// is a multiple of 4 (but not 8). Identical separable 6-tap filtering scheme:
// horizontal pass (xFilter taps 1..6), a 6-row sliding window (xsrc[]) for the
// vertical pass (yFilter taps 1..6), rounding via (sum + 32) >> 6, and clipping
// to [0, (1 << bitDepth) - 1]. Only the low 4 lanes of each register carry
// valid data; loads/stores use 64-bit halves (_vv_loadl/_vv_storel).
template<X86_VEXT vext>
void applyFrac6tap_SIMD_4x( const Pel* org, const ptrdiff_t origStride, Pel* buf, const ptrdiff_t buffStride, const int bsx, const int bsy, const int16_t* xFilter, const int16_t* yFilter, const int bitDepth )
{
  const int base = -3; // horizontal offset of the first tap relative to the output sample

  CHECK( bsx & 3, "SIMD blockSizeX needs to be a multiple of 4" );

  const Pel maxSampleValue = ( 1 << bitDepth ) - 1;

  // Interleaved tap pairs, one _mm_madd_epi16 per pair (tap index 0 unused).
  const __m128i yfilt12 = _mm_unpacklo_epi16( _mm_set1_epi16( yFilter[1] ), _mm_set1_epi16( yFilter[2] ) );
  const __m128i yfilt34 = _mm_unpacklo_epi16( _mm_set1_epi16( yFilter[3] ), _mm_set1_epi16( yFilter[4] ) );
  const __m128i yfilt56 = _mm_unpacklo_epi16( _mm_set1_epi16( yFilter[5] ), _mm_set1_epi16( yFilter[6] ) );

  const __m128i xfilt12 = _mm_unpacklo_epi16( _mm_set1_epi16( xFilter[1] ), _mm_set1_epi16( xFilter[2] ) );
  const __m128i xfilt34 = _mm_unpacklo_epi16( _mm_set1_epi16( xFilter[3] ), _mm_set1_epi16( xFilter[4] ) );
  const __m128i xfilt56 = _mm_unpacklo_epi16( _mm_set1_epi16( xFilter[5] ), _mm_set1_epi16( xFilter[6] ) );

  // Clipping bounds for the final store.
  const __m128i xmax = _mm_set1_epi16( maxSampleValue );
  const __m128i xmin = _mm_setzero_si128();

  const int yOffset = 1 - 3; // first vertical tap sits 2 rows above the output row
  const Pel* srcCol = org + base + yOffset * origStride;
        Pel* dstCol = buf;

  for( int x1 = 0; x1 < bsx; x1 += 4, srcCol += 4, dstCol += 4 )
  {
    const Pel* srcRow = srcCol;
          Pel* dstRow = dstCol;

    // 6-row sliding window; slots 1..5 are primed for y1 = 1..5 and slot 0 is
    // first written (from slot 1) before it is ever read.
    __m128i xsrc[6];

    for( int y1 = 1; y1 < bsy + 6; y1++, srcRow += origStride )
    {
      // Six shifted 4-sample loads covering horizontal taps 1..6.
      __m128i xsrc1 = _vv_loadl_epi64( ( const __m128i* ) &srcRow[1] );
      __m128i xsrc2 = _vv_loadl_epi64( ( const __m128i* ) &srcRow[2] );
      __m128i xsrc3 = _vv_loadl_epi64( ( const __m128i* ) &srcRow[3] );
      __m128i xsrc4 = _vv_loadl_epi64( ( const __m128i* ) &srcRow[4] );
      __m128i xsrc5 = _vv_loadl_epi64( ( const __m128i* ) &srcRow[5] );
      __m128i xsrc6 = _vv_loadl_epi64( ( const __m128i* ) &srcRow[6] );

      // Accumulator starts with the rounding offset 1 << 5 (for the >> 6 below).
      __m128i
      xsum0 = _mm_set1_epi32( 1 << 5 );

      xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( _mm_unpacklo_epi16( xsrc1, xsrc2 ), xfilt12 ) );
      xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( _mm_unpacklo_epi16( xsrc3, xsrc4 ), xfilt34 ) );
      xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( _mm_unpacklo_epi16( xsrc5, xsrc6 ), xfilt56 ) );

      xsum0 = _mm_srai_epi32( xsum0, 6 );
      // Only the low 4 lanes are meaningful; the high half is packed from zero.
      __m128i
      xsum = _mm_packs_epi32( xsum0, _mm_setzero_si128() );

      if( y1 >= 6 )
      {
        // Window full: shift down and append the newest filtered row.
        xsrc[0] = xsrc[1];
        xsrc[1] = xsrc[2];
        xsrc[2] = xsrc[3];
        xsrc[3] = xsrc[4];
        xsrc[4] = xsrc[5];
        xsrc[5] = xsum;

        // Vertical filtering over the 6-row window, same rounding scheme.
        xsum0 = _mm_set1_epi32( 1 << 5 );

        xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( yfilt12, _mm_unpacklo_epi16( xsrc[0], xsrc[1] ) ) );
        xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( yfilt34, _mm_unpacklo_epi16( xsrc[2], xsrc[3] ) ) );
        xsum0 = _mm_add_epi32( xsum0, _mm_madd_epi16( yfilt56, _mm_unpacklo_epi16( xsrc[4], xsrc[5] ) ) );

        xsum0 = _mm_srai_epi32( xsum0, 6 );
        xsum = _mm_packs_epi32( xsum0, _mm_setzero_si128() );
        // Clip to the valid sample range and store 4 output samples.
        xsum = _mm_min_epi16( xmax, _mm_max_epi16( xmin, xsum ) );

        _vv_storel_epi64( ( __m128i* ) dstRow, xsum );
        dstRow += buffStride;
      }
      else
      {
        // Priming phase: fill window slots 1..5 before any output is produced.
        xsrc[y1] = xsum;
      }
    }
  }
}
Unexecuted instantiation: void vvenc::applyFrac6tap_SIMD_4x<(vvenc::x86_simd::X86_VEXT)1>(short const*, long, short*, long, int, int, short const*, short const*, int)
Unexecuted instantiation: void vvenc::applyFrac6tap_SIMD_4x<(vvenc::x86_simd::X86_VEXT)4>(short const*, long, short*, long, int, int, short const*, short const*, int)
856
857
// Per-log2(width) plane-fit normalization table used by applyPlanarCorrectionSIMD
// (denom = blockSize * xSzm[floorLog2(w)]). Numerically, xSzm[k] equals
// w * sum_{x=0}^{w-1} (x - (w-1)/2)^2 for w = 1 << k, e.g. w=4: 4*5 = 20,
// w=8: 8*42 = 336, w=16: 16*340 = 5440.
static const int32_t xSzm[6] = {0, 1, 20, 336, 5440, 87296};
858
859
// works for bit depths up to incl. 12 and power-of-2 block dimensions in both directions
860
// Fits a plane b0 + b1*x + b2*y (fixed-point) to the per-sample difference
// z = dst - ref over a w x h block and subtracts the fitted plane from dst
// (deblocking correction), clipping results to [0, clpRng.max()].
// The fit strength is scaled by mWeight = min(512, motionError^2).
// Three accumulation/apply code paths exist: AVX2 for w > 8, SSE for w == 8,
// and an SSE path that processes two rows at once for w == 4.
// NOTE(review): despite its name, ySum accumulates the plain sum of the
// differences z (dot product with all-ones), not a sum of y indices.
// NOTE(review): b2 and b0 reuse xSum / log2Width for the y dimension, which
// is exact only for square blocks (w == h) — the header comment above only
// guarantees power-of-2 dimensions; confirm callers pass square blocks.
template<X86_VEXT vext>
void applyPlanarCorrectionSIMD( const Pel* refPel, const ptrdiff_t refStride, Pel* dstPel, const ptrdiff_t dstStride, const int32_t w, const int32_t h, const ClpRng& clpRng, const uint16_t motionError )
{
  const int32_t blockSize = w * h;
  const int32_t log2Width = floorLog2 (w);
  const int32_t maxPelVal = clpRng.max();
  const int32_t mWeight   = std::min (512u, (uint32_t) motionError * (uint32_t) motionError);
  const int32_t xSum      = (blockSize * (w - 1)) >> 1; // sum of x indices over the whole block
  int32_t x1yzm = 0,  x2yzm = 0,  ySum = 0;             // sum(z*x), sum(z*y), sum(z)
  int32_t b0, b1, b2;
  int64_t numer, denom;
  int32_t tmp;

#if USE_AVX2
  if (w > 8)
  {
    // AVX2 accumulation: 16 samples per iteration; vx/vy carry running
    // x/y indices as 16-bit lanes for the madd-based dot products.
    __m256i vz;
    __m256i vzz = _mm256_set_epi32(0,0,0,0,0,0,0,0);
    __m256i vDst;
    __m256i vRef;
    __m256i vtmp;
    __m256i v16 = _mm256_set_epi16(16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16);
    __m256i v1 = _mm256_set_epi16(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1);
    __m256i vy = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
    __m256i vx2yzm = _mm256_set_epi32(0,0,0,0,0,0,0,0);
    __m256i vx1yzm = _mm256_set_epi32(0,0,0,0,0,0,0,0);

    for (int32_t y = 0; y < h; y++) // sum up dot-products between indices and sample diffs
    {
      __m256i vx = _mm256_set_epi16(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);

      for (int32_t x = 0; x < w; x += 16)
      {
        vDst = _mm256_loadu_si256( (const __m256i* ) (dstPel+y*dstStride + x ));
        vRef = _mm256_loadu_si256( (const __m256i* ) (refPel+y*refStride + x ));
        vz = _mm256_sub_epi16 (vDst, vRef);        // z = dst - ref
        vtmp = _mm256_madd_epi16(vz,vx);           // sum(z * x)
        vx1yzm = _mm256_add_epi32(vx1yzm,vtmp);
        vtmp = _mm256_madd_epi16(vz,v1);           // sum(z)
        vzz = _mm256_add_epi32(vzz,vtmp);
        vtmp = _mm256_madd_epi16(vz,vy);           // sum(z * y)
        vx2yzm = _mm256_add_epi32(vx2yzm,vtmp);
        vx = _mm256_add_epi16(vx,v16);
      }
      vy = _mm256_add_epi16(vy,v1);
    }
    // Horizontal reduction: both 128-bit halves of each AVX2 accumulator
    // must be extracted and added (elements 0 and 4 after the two hadds).
    vx1yzm = _mm256_hadd_epi32(vx1yzm,vx1yzm);
    vx1yzm = _mm256_hadd_epi32(vx1yzm,vx1yzm);
    tmp = _mm256_extract_epi32 (vx1yzm, 0);
    x1yzm += tmp;
    tmp = _mm256_extract_epi32 (vx1yzm, 4);
    x1yzm += tmp;

    vx2yzm = _mm256_hadd_epi32(vx2yzm,vx2yzm);
    vx2yzm = _mm256_hadd_epi32(vx2yzm,vx2yzm);
    tmp = _mm256_extract_epi32 (vx2yzm, 0);
    x2yzm += tmp;
    tmp = _mm256_extract_epi32 (vx2yzm, 4);
    x2yzm += tmp;

    vzz = _mm256_hadd_epi32(vzz,vzz);
    vzz = _mm256_hadd_epi32(vzz,vzz);
    tmp = _mm256_extract_epi32 (vzz, 0);
    ySum += tmp;
    tmp = _mm256_extract_epi32 (vzz, 4);
    ySum += tmp;
  }
  else
#endif
  {
    __m128i vz;
    __m128i vzz = _mm_set_epi32(0,0,0,0);
    __m128i vDst;
    __m128i vRef;
    __m128i vtmp;
    __m128i v8 = _mm_set_epi16(8,8,8,8,8,8,8,8);
    __m128i vx2yzm = _mm_set_epi32(0,0,0,0);
    __m128i vx1yzm = _mm_set_epi32(0,0,0,0);
    __m128i v1 = _mm_set_epi16(1,1,1,1,1,1,1,1);

    if (w == 4)
    {
      // 4-wide blocks: pack two consecutive rows into one register, so vy
      // holds (y, y, y, y, y+1, y+1, y+1, y+1) and advances by 2 per pass.
      __m128i vy = _mm_set_epi16(1,1,1,1,0,0,0,0);
      __m128i v2 = _mm_set_epi16(2,2,2,2,2,2,2,2);

      for (int32_t y = 0; y < h; y += 2) // sum up dot-products between indices and sample diffs
      {
        __m128i vx = _mm_set_epi16(3,2,1,0,3,2,1,0);
        vDst = _mm_loadu_si64( ( __m128i const * ) (dstPel+y*dstStride));
        vRef = _mm_loadu_si64( ( __m128i const * ) (refPel+y*refStride));
        __m128i vDsth = _mm_loadu_si64( ( __m128i const * ) (dstPel+(y+1)*dstStride));
        __m128i vRefh = _mm_loadu_si64( ( __m128i const * ) (refPel+(y+1)*refStride));
        // Merge row y (low half) and row y+1 (shifted into the high half).
        vDsth = _mm_bslli_si128 (vDsth,8);
        vRefh = _mm_bslli_si128 (vRefh,8);
        vDst = _mm_or_si128(vDst,vDsth);
        vRef = _mm_or_si128(vRef,vRefh);

        vz = _mm_sub_epi16 (vDst, vRef);
        vtmp = _mm_madd_epi16(vz,vx);
        vx1yzm = _mm_add_epi32(vx1yzm,vtmp);
        vtmp = _mm_madd_epi16(vz,v1);
        vzz = _mm_add_epi32(vzz,vtmp);
        vtmp = _mm_madd_epi16(vz,vy);
        vx2yzm = _mm_add_epi32(vx2yzm,vtmp);
        vy = _mm_add_epi16(vy,v2);
      }
      vx1yzm = _mm_hadd_epi32(vx1yzm,vx1yzm);
      vx1yzm = _mm_hadd_epi32(vx1yzm,vx1yzm);
      tmp = _mm_extract_epi32 (vx1yzm, 0);
      x1yzm += tmp;

      vx2yzm = _mm_hadd_epi32(vx2yzm,vx2yzm);
      vx2yzm = _mm_hadd_epi32(vx2yzm,vx2yzm);
      tmp = _mm_extract_epi32 (vx2yzm, 0);
      x2yzm += tmp;

      vzz = _mm_hadd_epi32(vzz,vzz);
      vzz = _mm_hadd_epi32(vzz,vzz);
      tmp = _mm_extract_epi32 (vzz, 0);
      ySum += tmp;
    }
    else
    {
      // 8-wide SSE path, one row per iteration.
      __m128i vy = _mm_set_epi16(0,0,0,0,0,0,0,0);
      __m128i v1 = _mm_set_epi16(1,1,1,1,1,1,1,1);

      for (int32_t y = 0; y < h; y++) // sum up dot-products between indices and sample diffs
      {
        __m128i vx = _mm_set_epi16(7,6,5,4,3,2,1,0);

        for (int32_t x = 0; x < w; x += 8)
        {
          vDst = _mm_loadu_si128( ( __m128i const * ) (dstPel+y*dstStride + x ));
          vRef = _mm_loadu_si128( ( __m128i const * ) (refPel+y*refStride + x ));
          vz = _mm_sub_epi16 (vDst, vRef);
          vtmp = _mm_madd_epi16(vz,vx);
          vx1yzm = _mm_add_epi32(vx1yzm,vtmp);
          vtmp = _mm_madd_epi16(vz,v1);
          vzz = _mm_add_epi32(vzz,vtmp);
          vtmp = _mm_madd_epi16(vz,vy);
          vx2yzm = _mm_add_epi32(vx2yzm,vtmp);
          vx = _mm_add_epi16(vx,v8);
        }
        vy = _mm_add_epi16(vy,v1);
      }
      vx1yzm = _mm_hadd_epi32(vx1yzm,vx1yzm);
      vx1yzm = _mm_hadd_epi32(vx1yzm,vx1yzm);
      tmp = _mm_extract_epi32 (vx1yzm, 0);
      x1yzm += tmp;

      vx2yzm = _mm_hadd_epi32(vx2yzm,vx2yzm);
      vx2yzm = _mm_hadd_epi32(vx2yzm,vx2yzm);
      tmp = _mm_extract_epi32 (vx2yzm, 0);
      x2yzm += tmp;

      vzz = _mm_hadd_epi32(vzz,vzz);
      vzz = _mm_hadd_epi32(vzz,vzz);
      tmp = _mm_extract_epi32 (vzz, 0);
      ySum += tmp;
    }
  }

  denom = blockSize * xSzm[log2Width]; // plane-fit parameters, in fixed-point arithmetic
  numer = (int64_t) mWeight * ((int64_t) x1yzm * blockSize - xSum * ySum);
  // Symmetric rounding before the integer division, then clamp to int16 range.
  b1 = int32_t ((numer < 0 ? numer - (denom >> 1) : numer + (denom >> 1)) / denom);
  b1 = (b1 < INT16_MIN ? INT16_MIN : (b1 > INT16_MAX ? INT16_MAX : b1));
  numer = (int64_t) mWeight * ((int64_t) x2yzm * blockSize - xSum * ySum);
  b2 = int32_t ((numer < 0 ? numer - (denom >> 1) : numer + (denom >> 1)) / denom);
  b2 = (b2 > INT16_MAX ? INT16_MAX : (b2 < INT16_MIN ? INT16_MIN : b2));
  // Offset term; >> (2*log2Width) divides by w*w (== blockSize for square blocks).
  b0 = (mWeight * ySum - (b1 + b2) * xSum + (blockSize >> 1)) >> (log2Width << 1);

  if (b0 == 0 && b1 == 0 && b2 == 0) return; // nothing to correct

#if USE_AVX2
  if (w > 8)
  {
    // AVX2 apply path: evaluate the plane per sample in 32-bit lanes
    // (mullo/mulhi pairs widen the 16-bit products) and subtract it from dst.
    __m256i vb0 = _mm256_set1_epi32 (b0);
    __m256i vb1 = _mm256_set1_epi16 ((int16_t) b1);
    __m256i vb2 = _mm256_set1_epi16 ((int16_t) b2);
    __m256i vy = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
    __m256i v1 = _mm256_set_epi16(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1);
    __m256i v0 = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
    __m256i v16 = _mm256_set_epi16(16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16);
    __m256i v256 = _mm256_set_epi32 (256,256,256,256,256,256,256,256);

    __m256i vpelmin   = _mm256_set1_epi16(0);
    __m256i vpelmax   = _mm256_set1_epi16(maxPelVal);

    for (int32_t y = 0; y < h; y++) // perform deblocking by adding fitted correction plane
    {
      __m256i vx = _mm256_set_epi16(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);

      for (int32_t x = 0; x < w; x += 16)
      {
        __m256i vDst = _mm256_loadu_si256( ( __m256i const * ) (dstPel+y*dstStride + x ));

        __m256i vDstl = _mm256_unpacklo_epi16 (vDst,v0);
        __m256i vDsth = _mm256_unpackhi_epi16 (vDst,v0);

        //const int32_t p = (b0 + b1 * x + b2 * y + 256) >> 9; // fixed-point plane corrector
        __m256i vtmph =_mm256_mulhi_epi16(vb1,vx);
        __m256i vtmpl =_mm256_mullo_epi16(vb1,vx);
        __m256i vb1l  = _mm256_unpacklo_epi16(vtmpl,vtmph);
        __m256i vb1h  = _mm256_unpackhi_epi16(vtmpl,vtmph);

        vtmph =_mm256_mulhi_epi16(vb2,vy);
        vtmpl =_mm256_mullo_epi16(vb2,vy);
        __m256i vb2l  = _mm256_unpacklo_epi16(vtmpl,vtmph);
        __m256i vb2h  = _mm256_unpackhi_epi16(vtmpl,vtmph);

        vb1l = _mm256_add_epi32(vb1l,vb2l);
        vb1h = _mm256_add_epi32(vb1h,vb2h);

        vb1l = _mm256_add_epi32(vb1l,vb0);
        vb1h = _mm256_add_epi32(vb1h,vb0);

        vb1l = _mm256_add_epi32(vb1l,v256);
        vb1h = _mm256_add_epi32(vb1h,v256);

        vb1l = _mm256_srai_epi32 (vb1l,9);
        vb1h = _mm256_srai_epi32 (vb1h,9);

        vDstl = _mm256_sub_epi32 (vDstl,vb1l);
        vDsth = _mm256_sub_epi32 (vDsth,vb1h);

        // Pack back to 16 bit, clip to the valid pel range, and store.
        vDst = _mm256_packs_epi32 (vDstl,vDsth);
        vDst = _mm256_min_epi16( vpelmax, _mm256_max_epi16( vpelmin, vDst ) );
        _mm256_storeu_si256 ((__m256i* ) (dstPel+y*dstStride + x ) ,vDst);

        vx = _mm256_add_epi16(vx,v16);
      }
      vy = _mm256_add_epi16(vy,v1);
    }
  }
  else
#endif
  {
    __m128i vb0 = _mm_set1_epi32 (b0);
    __m128i vb1 = _mm_set1_epi16 ((int16_t) b1);
    __m128i vb2 = _mm_set1_epi16 ((int16_t) b2);
    __m128i v0 = _mm_set_epi16(0,0,0,0,0,0,0,0);
    __m128i v256 = _mm_set_epi32 (256,256,256,256);

    __m128i vpelmin = _mm_set1_epi16(0);
    __m128i vpelmax = _mm_set1_epi16(maxPelVal);

    if (w == 4)
    {
      // 4-wide apply path: two rows per register, mirroring the accumulation.
      __m128i vy = _mm_set_epi16(1,1,1,1,0,0,0,0);
      __m128i v2 = _mm_set_epi16(2,2,2,2,2,2,2,2);

      for (int32_t y = 0; y < h; y += 2) // perform deblocking by adding fitted correction plane
      {
        __m128i vx = _mm_set_epi16(3,2,1,0,3,2,1,0);

        {
          __m128i vDst = _mm_loadu_si64( ( __m128i const * ) (dstPel+y*dstStride));
          __m128i vDsth = _mm_loadu_si64( ( __m128i const * ) (dstPel+(y+1)*dstStride));
          vDsth = _mm_bslli_si128 (vDsth,8);
          vDst = _mm_or_si128(vDst,vDsth);
          __m128i vDstl = _mm_unpacklo_epi16 (vDst,v0);
          vDsth = _mm_unpackhi_epi16 (vDst,v0);

          //const int32_t p = (b0 + b1 * x + b2 * y + 256) >> 9; // fixed-point plane corrector
          __m128i vtmph =_mm_mulhi_epi16(vb1,vx);
          __m128i vtmpl =_mm_mullo_epi16(vb1,vx);
          __m128i vb1l  = _mm_unpacklo_epi16(vtmpl,vtmph);
          __m128i vb1h  = _mm_unpackhi_epi16(vtmpl,vtmph);
          vtmph =_mm_mulhi_epi16(vb2,vy);
          vtmpl =_mm_mullo_epi16(vb2,vy);
          __m128i vb2l  = _mm_unpacklo_epi16(vtmpl,vtmph);
          __m128i vb2h  = _mm_unpackhi_epi16(vtmpl,vtmph);

          vb1l = _mm_add_epi32(vb1l,vb2l);
          vb1h = _mm_add_epi32(vb1h,vb2h);

          vb1l = _mm_add_epi32(vb1l,vb0);
          vb1h = _mm_add_epi32(vb1h,vb0);

          vb1l = _mm_add_epi32(vb1l,v256);
          vb1h = _mm_add_epi32(vb1h,v256);

          vb1l = _mm_srai_epi32 (vb1l,9);
          vb1h = _mm_srai_epi32 (vb1h,9);
          vDstl = _mm_sub_epi32 (vDstl,vb1l);
          vDsth = _mm_sub_epi32 (vDsth,vb1h);

          vDst = _mm_packs_epi32 (vDstl,vDsth);
          vDst = _mm_min_epi16( vpelmax, _mm_max_epi16( vpelmin, vDst ) );

          // Low half -> row y, high half -> row y+1.
          _mm_storeu_si64 ((__m128i* ) (dstPel+y*dstStride),vDst);
          _mm_storeu_si64 ((__m128i* ) (dstPel+(y+1)*dstStride),_mm_srli_si128(vDst,8));
        }
        vy = _mm_add_epi16(vy,v2);
      }
    }
    else
    {
      // 8-wide SSE apply path.
      __m128i vy = _mm_set_epi16(0,0,0,0,0,0,0,0);
      __m128i v1 = _mm_set_epi16(1,1,1,1,1,1,1,1);
      __m128i v8 = _mm_set_epi16(8,8,8,8,8,8,8,8);

      for (int32_t y = 0; y < h; y++) // perform deblocking by adding fitted correction plane
      {
        __m128i vx = _mm_set_epi16(7,6,5,4,3,2,1,0);

        for (int32_t x = 0; x < w; x += 8)
        {
          __m128i vDst = _mm_loadu_si128( ( __m128i const * ) (dstPel+y*dstStride + x ));
          __m128i vDstl = _mm_unpacklo_epi16 (vDst,v0);
          __m128i vDsth = _mm_unpackhi_epi16 (vDst,v0);

          //const int32_t p = (b0 + b1 * x + b2 * y + 256) >> 9; // fixed-point plane corrector
          __m128i vtmph =_mm_mulhi_epi16(vb1,vx);
          __m128i vtmpl =_mm_mullo_epi16(vb1,vx);
          __m128i vb1l  = _mm_unpacklo_epi16(vtmpl,vtmph);
          __m128i vb1h  = _mm_unpackhi_epi16(vtmpl,vtmph);
          vtmph =_mm_mulhi_epi16(vb2,vy);
          vtmpl =_mm_mullo_epi16(vb2,vy);
          __m128i vb2l  = _mm_unpacklo_epi16(vtmpl,vtmph);
          __m128i vb2h  = _mm_unpackhi_epi16(vtmpl,vtmph);

          vb1l = _mm_add_epi32(vb1l,vb2l);
          vb1h = _mm_add_epi32(vb1h,vb2h);

          vb1l = _mm_add_epi32(vb1l,vb0);
          vb1h = _mm_add_epi32(vb1h,vb0);

          vb1l = _mm_add_epi32(vb1l,v256);
          vb1h = _mm_add_epi32(vb1h,v256);

          vb1l = _mm_srai_epi32 (vb1l,9);
          vb1h = _mm_srai_epi32 (vb1h,9);
          vDstl = _mm_sub_epi32 (vDstl,vb1l);
          vDsth = _mm_sub_epi32 (vDsth,vb1h);

          vDst = _mm_packs_epi32 (vDstl,vDsth);

          vDst = _mm_min_epi16( vpelmax, _mm_max_epi16( vpelmin, vDst ) );
          _mm_storeu_si128 ((__m128i* )  (dstPel+y*dstStride + x ) ,vDst);

          vx = _mm_add_epi16(vx,v8);
        }
        vy = _mm_add_epi16(vy,v1);
      }
    }
  }
}
Unexecuted instantiation: void vvenc::applyPlanarCorrectionSIMD<(vvenc::x86_simd::X86_VEXT)1>(short const*, long, short*, long, int, int, vvenc::ClpRng const&, unsigned short)
Unexecuted instantiation: void vvenc::applyPlanarCorrectionSIMD<(vvenc::x86_simd::X86_VEXT)4>(short const*, long, short*, long, int, int, vvenc::ClpRng const&, unsigned short)
1208
1209
// Temporal-filter blend of one block: for each reference (motion-compensated
// picture in correctedPics), a noise estimate is derived from the variance of
// the src/ref differences vs. their spatial gradient, per-reference weights
// (vww) and sigma scalings (vsw) are computed from noise and motion error, and
// the output is a weighted average of the original samples and all references
// with Gaussian-like weights approximated by a fast exponential.
// The blend result is rounded, clipped to [0, clpRng.max()] and stored in dst.
//
// src/dst        : picture buffers; the block at blk(x, y, width, height) is processed.
// correctedPics  : numRefs motion-compensated reference blocks, each w*h, stride w.
// verror         : per-reference motion error values.
// refStrenghts   : per-reference strength factors applied to the weights.
template<X86_VEXT vext>
void applyBlockSIMD( const CPelBuf& src, PelBuf& dst, const CompArea& blk, const ClpRng& clpRng, const Pel **correctedPics, int numRefs, const int *verror, const double *refStrenghts, double weightScaling, double sigmaSq )
{
  const int         w = blk.width;
  const int         h = blk.height;
  const int        bx = blk.x;
  const int        by = blk.y;

  const ptrdiff_t srcStride = src.stride;
  const ptrdiff_t dstStride = dst.stride;

  const Pel *srcPel = src.bufAt( bx, by );
        Pel *dstPel = dst.bufAt( bx, by );

  int vnoise[2 * VVENC_MCTF_RANGE] = { 0, }; // per-reference noise estimates
  float vsw [2 * VVENC_MCTF_RANGE] = { 0.0f, }; // per-reference sigma scalings
  float vww [2 * VVENC_MCTF_RANGE] = { 0.0f, }; // per-reference base weights

  int minError = INT32_MAX;

  // Pass 1: per-reference noise estimation.
  for( int i = 0; i < numRefs; i++ )
  {
    const ptrdiff_t refStride = w;
    const Pel *     refPel    = correctedPics[i];
    __m128i xvar = _mm_setzero_si128(), xdiffsum = _mm_setzero_si128();

    // xvar is a sum of squares of differences of 10bit unsigned values
    //                             \----------------------------------/
    //                                  still 10 bit
    //                  \---------------------------------------------/
    //                              up to 20 bit
    // leaving 12 bit (2^6*2^6) for the sum, which is ok for blocks up to 64x64, with w and h being usually 8 or 16 (2^3 or 2^4)
    // diffsum has double the number of entries, so one less bit

    if( ( w & 7 ) == 4 )
    {
      // 4-wide path. xshufr zeroes the last lane so the right-neighbor
      // gradient is not computed across the block's right edge.
      const __m128i xshufr = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );

      for( int y1 = 0; y1 < h; y1++ )
      {
        // probably only one iteration anyway. the case of w==4 does not occur very often (probably only for chroma, when MCTFUnitSize=8, i.e for res < 720p).
        for( int x1 = 0; x1 < w; x1 += 4 )
        {
          const Pel *pix0 = srcPel + srcStride * y1 + x1;
          const Pel *ref0 = refPel + refStride * y1 + x1;
          const Pel *pixr = pix0 + 1;        // right neighbors
          const Pel* refr = ref0 + 1;
          const Pel* pixd = pix0 + srcStride; // down neighbors
          const Pel* refd = ref0 + refStride;

          __m128i xpix0 = _vv_loadl_epi64( ( const __m128i* ) pix0 );
          __m128i xref0 = _vv_loadl_epi64( ( const __m128i* ) ref0 );
          __m128i xpixr = _vv_loadl_epi64( ( const __m128i* ) pixr );
          __m128i xrefr = _vv_loadl_epi64( ( const __m128i* ) refr );

          // variance term: sum of squared src/ref differences
          __m128i xdiff = _mm_sub_epi16( xpix0, xref0 );
          xvar = _mm_add_epi32( xvar, _mm_madd_epi16( xdiff, xdiff ) );

          // vertical gradient of the difference (skip for last row)
          if( y1 + 1 != h )
          {
            __m128i xpixd  = _vv_loadl_epi64( ( const __m128i* ) pixd );
            __m128i xrefd  = _vv_loadl_epi64( ( const __m128i* ) refd );
            __m128i xdiffd = _mm_sub_epi16( xpixd, xrefd );
            xdiffd = _mm_sub_epi16( xdiffd, xdiff );
            xdiffsum = _mm_add_epi32( xdiffsum, _mm_madd_epi16( xdiffd, xdiffd ) );
          }

          // at the right block border, mask out the lane that would pair
          // with a sample outside the block
          if( x1 + 4 == w )
          {
            xpix0 = _mm_shuffle_epi8( xpix0, xshufr );
            xpixr = _mm_shuffle_epi8( xpixr, xshufr );
            xref0 = _mm_shuffle_epi8( xref0, xshufr );
            xrefr = _mm_shuffle_epi8( xrefr, xshufr );

            xdiff = _mm_sub_epi16( xpix0, xref0 );
          }

          // horizontal gradient of the difference
          __m128i xdiffr = _mm_sub_epi16( xpixr, xrefr );
          xdiffr = _mm_sub_epi16( xdiffr, xdiff );
          xdiffsum = _mm_add_epi32( xdiffsum, _mm_madd_epi16( xdiffr, xdiffr ) );
        }
      }
    }
    else
    {
      // 8-wide path; same scheme, last (8th) lane masked at the right edge.
      const __m128i xshufr = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 );

      for( int y1 = 0; y1 < h; y1++ )
      {
        for( int x1 = 0; x1 < w; x1 += 8 )
        {
          const Pel *pix0 = srcPel + srcStride * y1 + x1;
          const Pel *ref0 = refPel + refStride * y1 + x1;
          const Pel *pixr = pix0 + 1;
          const Pel* refr = ref0 + 1;
          const Pel* pixd = pix0 + srcStride;
          const Pel* refd = ref0 + refStride;

          __m128i xpix0 = _mm_loadu_si128( ( const __m128i* ) pix0 );
          __m128i xref0 = _mm_loadu_si128( ( const __m128i* ) ref0 );
          __m128i xpixr = _mm_loadu_si128( ( const __m128i* ) pixr );
          __m128i xrefr = _mm_loadu_si128( ( const __m128i* ) refr );

          __m128i xdiff = _mm_sub_epi16( xpix0, xref0 );
          xvar = _mm_add_epi32( xvar, _mm_madd_epi16( xdiff, xdiff ) );

          if( y1 + 1 != h )
          {
            __m128i xpixd  = _mm_loadu_si128( ( const __m128i* ) pixd );
            __m128i xrefd  = _mm_loadu_si128( ( const __m128i* ) refd );
            __m128i xdiffd = _mm_sub_epi16( xpixd, xrefd );
            xdiffd = _mm_sub_epi16( xdiffd, xdiff );
            xdiffsum = _mm_add_epi32( xdiffsum, _mm_madd_epi16( xdiffd, xdiffd ) );
          }

          if( x1 + 8 == w )
          {
            xpix0 = _mm_shuffle_epi8( xpix0, xshufr );
            xpixr = _mm_shuffle_epi8( xpixr, xshufr );
            xref0 = _mm_shuffle_epi8( xref0, xshufr );
            xrefr = _mm_shuffle_epi8( xrefr, xshufr );

            xdiff = _mm_sub_epi16( xpix0, xref0 );
          }

          __m128i xdiffr = _mm_sub_epi16( xpixr, xrefr );
          xdiffr = _mm_sub_epi16( xdiffr, xdiff );
          xdiffsum = _mm_add_epi32( xdiffsum, _mm_madd_epi16( xdiffr, xdiffr ) );
        }
      }
    }

    // Reduce: element 0 = total variance, element 1 = total gradient sum.
    xvar = _mm_hadd_epi32( xvar, xdiffsum );
    xvar = _mm_hadd_epi32( xvar, xvar );
    int64_t variance = _mm_cvtsi128_si32( xvar );
    int64_t diffsum  = _mm_extract_epi32( xvar, 1 );
    // Normalize both sums to a 10-bit sample scale.
    variance *= (int64_t) 1 << (2*(10-clpRng.bd));
    diffsum  *= (int64_t) 1 << (2*(10-clpRng.bd));

    const int cntV = w * h;
    const int cntD = 2 * cntV - w - h; // number of gradient terms (right + down, minus edges)
    vnoise[i] = ( int ) ( ( ( 15.0 * cntD / cntV * variance + 5.0 ) / ( diffsum + 5.0 ) ) + 0.5 );
    minError = std::min( minError, verror[i] );
  }

  // Pass 2: derive per-reference blend weight (vww) and sigma scaling (vsw)
  // from the noise estimate and the motion error relative to the best one.
  for( int i = 0; i < numRefs; i++ )
  {
    const int error = verror[i];
    const int noise = vnoise[i];
    float ww = 1, sw = 1;
    ww *= ( noise < 25 ) ? 1.0 : 0.6;
    sw *= ( noise < 25 ) ? 1.0 : 0.8;
    ww *= ( error < 50 ) ? 1.2 : ( ( error > 100 ) ? 0.6 : 1.0 );
    sw *= ( error < 50 ) ? 1.0 : 0.8;
    ww *= ( ( minError + 1.0 ) / ( error + 1.0 ) );

    vww[i] = ww * weightScaling * refStrenghts[i];
    vsw[i] = sw * 2 * sigmaSq;
  }

  //inline static float fastExp( float x )
  //{
  //  // using the e^x ~= ( 1 + x/n )^n for n -> inf
  //  float x = 1.0 + x / 1024;
  //  x *= x; x *= x; x *= x; x *= x;
  //  x *= x; x *= x; x *= x; x *= x;
  //  x *= x; x *= x;
  //  return x;
  //}

  // Pass 3: weighted blend, 4 samples at a time.
  for( int y = 0; y < h; y++ )
  {
    for( int x = 0; x < w; x += 4 )
    {
      __m128i vorgi = _mm_cvtepi16_epi32( _vv_loadl_epi64( ( __m128i* ) ( srcPel + srcStride * y + x ) ) );
      __m128  vorg  = _mm_cvtepi32_ps( vorgi );
      //const Pel orgVal  = *( srcPel + srcStride * y + x );
      __m128  vtws  = _mm_set1_ps( 1.0f );
      //float temporalWeightSum = 1.0;
      //float newVal = ( float ) orgVal;
      __m128  vnewv = vorg;

      for( int i = 0; i < numRefs; i++ )
      {
        const Pel* pCorrectedPelPtr = correctedPics[i] + y * w + x;
        __m128i vrefi = _mm_cvtepi16_epi32( _vv_loadl_epi64( ( __m128i* ) pCorrectedPelPtr ) );
        //const int    refVal = *pCorrectedPelPtr;
        // 16-bit sub/madd on the sign-extended 32-bit lanes: exact as long as
        // the sample values are non-negative (high halves are then zero) —
        // NOTE(review): assumes non-negative pel values; confirm upstream clipping.
        __m128i vdifi = _mm_sub_epi16( vrefi, vorgi );
        //const int    diff   = refVal - orgVal;
        //const float  diffSq = diff * diff;
        __m128i vdsqi = _mm_madd_epi16( vdifi, vdifi );
        __m128  vdsq  = _mm_cvtepi32_ps( vdsqi );

        // apply fast exp with 10 iterations!
        // fastExp(-diffSq / vsw[i]) via (1 + x/1024)^1024: ten squarings.
        __m128  vwght = _mm_div_ps( vdsq, _mm_set1_ps( -vsw[i] * 1024.0f ) );
        vwght = _mm_add_ps( vwght, _mm_set1_ps( 1.0f ) );

        vwght = _mm_mul_ps( vwght, vwght ); //  1
        vwght = _mm_mul_ps( vwght, vwght ); //  2
        vwght = _mm_mul_ps( vwght, vwght ); //  3
        vwght = _mm_mul_ps( vwght, vwght ); //  4
        vwght = _mm_mul_ps( vwght, vwght ); //  5

        vwght = _mm_mul_ps( vwght, vwght ); //  6
        vwght = _mm_mul_ps( vwght, vwght ); //  7
        vwght = _mm_mul_ps( vwght, vwght ); //  8
        vwght = _mm_mul_ps( vwght, vwght ); //  9
        vwght = _mm_mul_ps( vwght, vwght ); // 10

        vwght = _mm_mul_ps( vwght, _mm_set1_ps( vww[i] ) );
        //float weight = vww[i] * fastExp( -diffSq, vsw[i] );

        vnewv = _mm_add_ps( vnewv, _mm_mul_ps( vwght, _mm_cvtepi32_ps( vrefi ) ) );
        //newVal += weight * refVal;

        vtws  = _mm_add_ps( vtws, vwght );
        //temporalWeightSum += weight;
      }

      vnewv = _mm_div_ps( vnewv, vtws );
      //newVal /= temporalWeightSum;
      // +0.5 then truncate toward zero emulates the scalar (Pel)(newVal + 0.5).
      vnewv = _mm_add_ps( vnewv, _mm_set1_ps( 0.5f ) );
      vnewv = _mm_round_ps( vnewv, ( SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_NO_EXC ) );
      //Pel sampleVal = ( Pel ) ( newVal + 0.5 );
      __m128i vnewi = _mm_cvtps_epi32( vnewv );

      // Clip to [0, clpRng.max()] and store 4 output samples.
      vnewi = _mm_max_epi32( vnewi, _mm_setzero_si128() );
      vnewi = _mm_min_epi32( vnewi, _mm_set1_epi32( clpRng.max() ) );
      //sampleVal = ( sampleVal < 0 ? 0 : ( sampleVal > maxSampleValue ? maxSampleValue : sampleVal ) );

      vnewi = _mm_packs_epi32( vnewi, vnewi );
      //*( dstPel + srcStride * y + x ) = sampleVal;
      _vv_storel_epi64( ( __m128i * ) ( dstPel + dstStride * y + x ), vnewi );
    }
  }
}
Unexecuted instantiation: void vvenc::applyBlockSIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::AreaBuf<short const> const&, vvenc::AreaBuf<short>&, vvenc::CompArea const&, vvenc::ClpRng const&, short const**, int, int const*, double const*, double, double)
Unexecuted instantiation: void vvenc::applyBlockSIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::AreaBuf<short const> const&, vvenc::AreaBuf<short>&, vvenc::CompArea const&, vvenc::ClpRng const&, short const**, int, int const*, double const*, double, double)
1445
1446
template<X86_VEXT vext>
// Computes the (scaled) sample variance of a w x h block of Pel (16-bit) samples
// using SSE intrinsics.
//
// Parameters:
//   org        - pointer to the top-left sample of the block
//   origStride - stride (in samples) between consecutive rows
//   w, h       - block width and height; the code assumes w is a multiple of 8
//                (inner loops step by 8 samples) and that w and h are powers of
//                two (Log2() is used to derive an exact shift) -- TODO confirm
//                with callers
//
// Returns: sum over all samples of (sample - mean)^2, divided by 256.0.
// Note the mean is kept in 4.4 fixed point (scaled by 16), so the squared
// differences are scaled by 256; the final division by 256.0 undoes that.
double calcVarSse( const Pel* org, const ptrdiff_t origStride, const int w, const int h )
{
  // calculate average
  __m128i xavg32 = _mm_setzero_si128();
  __m128i xavg16 = _mm_setzero_si128();
  const __m128i xone = _mm_set1_epi16( 1 );
  for( int y1 = 0; y1 < h; y1++ )
  {
    // Per-row 16-bit accumulator: reset each row so the 16-bit lanes do not
    // overflow across the whole block (headroom depends on w and bit depth --
    // presumably guaranteed by the callers; verify for large w).
    xavg16 = _mm_setzero_si128();
    for( int x1 = 0; x1 < w; x1 += 8 )
    {
      xavg16 = _mm_add_epi16( xavg16, _mm_loadu_si128( ( const __m128i* ) ( org + x1 + y1 * origStride ) ) );
    }
    // madd with ones widens the eight 16-bit row sums to four 32-bit lanes
    // (pairwise sums) and accumulates them into the 32-bit total.
    xavg32 = _mm_add_epi32( xavg32, _mm_madd_epi16( xone, xavg16 ) );
  }

  // Horizontally reduce the four 32-bit partial sums to a single total,
  // then broadcast it to all four lanes.
  xavg32 = _mm_hadd_epi32( xavg32, xavg32 );
  xavg32 = _mm_hadd_epi32( xavg32, xavg32 );
  xavg32 = _mm_shuffle_epi32( xavg32, 0 );
  // Divide by w*h but keep 4 fractional bits: shift by Log2(w*h) - 4, i.e.
  // the lanes now hold the mean scaled by 16 (4.4 fixed point).
  int shift = Log2( w ) + Log2( h ) - 4;
  xavg32 = _mm_srai_epi32( xavg32, shift );
  // Narrow the broadcast mean back to eight 16-bit lanes for the SIMD
  // subtraction below.
  xavg16 = _mm_packs_epi32( xavg32, xavg32 );

  // calculate variance
  __m128i xvar = _mm_setzero_si128();
  for( int y1 = 0; y1 < h; y1++ )
  {
    for( int x1 = 0; x1 < w; x1 += 8 )
    {
      __m128i xpix = _mm_loadu_si128( ( const __m128i* ) ( org + x1 + y1 * origStride ) );
      // Scale samples by 16 to match the 4.4 fixed-point mean.
      xpix = _mm_slli_epi16( xpix, 4 );
      xpix = _mm_sub_epi16( xpix, xavg16 );
      // madd(x, x) squares each 16-bit difference and pairwise-adds into
      // four 32-bit lanes.
      xpix = _mm_madd_epi16( xpix, xpix );
      // Widen both 32-bit halves to 64 bits before accumulating so the
      // running sum of squares cannot overflow 32 bits.
      xvar = _mm_add_epi64( xvar, _mm_cvtepi32_epi64( xpix ) );
      xvar = _mm_add_epi64( xvar, _mm_cvtepi32_epi64( _mm_unpackhi_epi64( xpix, xpix ) ) );
    }
  }

  // Fold the two 64-bit lanes into one.
  xvar = _mm_add_epi64( xvar, _mm_unpackhi_epi64( xvar, xvar ) );

  // Undo the 16x scaling of both operands of the squared difference (16*16 = 256).
  return _mm_cvtsi128_si64( xvar ) / 256.0;
}
Unexecuted instantiation: double vvenc::calcVarSse<(vvenc::x86_simd::X86_VEXT)1>(short const*, long, int, int)
Unexecuted instantiation: double vvenc::calcVarSse<(vvenc::x86_simd::X86_VEXT)4>(short const*, long, int, int)
1489
1490
template<X86_VEXT vext>
1491
void MCTF::_initMCTF_X86()
1492
0
{
1493
0
  m_motionErrorLumaInt8     = motionErrorLumaInt_SIMD<vext>;
1494
0
  m_motionErrorLumaFrac8[0] = motionErrorLumaFrac_SIMD<vext>;
1495
0
  m_motionErrorLumaFrac8[1] = motionErrorLumaFrac_loRes_SIMD<vext>;
1496
1497
0
  m_applyFrac[0][0] = applyFrac6tap_SIMD_8x<vext>;
1498
0
  m_applyFrac[1][0] = applyFrac6tap_SIMD_4x<vext>;
1499
1500
0
  m_applyPlanarCorrection = applyPlanarCorrectionSIMD<vext>;
1501
0
  m_applyBlock            = applyBlockSIMD<vext>;
1502
0
  m_calcVar               = calcVarSse<vext>;
1503
0
}
Unexecuted instantiation: void vvenc::MCTF::_initMCTF_X86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::MCTF::_initMCTF_X86<(vvenc::x86_simd::X86_VEXT)4>()
1504
1505
template
1506
void MCTF::_initMCTF_X86<SIMDX86>();
1507
1508
}
1509
#endif
1510
//! \}