Coverage Report

Created: 2026-04-01 07:49

/src/vvenc/source/Lib/CommonLib/x86/RdCostX86.h
Line | Count | Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
/** \file     RdCostX86.h
43
    \brief    RD cost computation class, SIMD version
44
*/
45
46
#pragma once
47
48
#include "CommonDefX86.h"
49
#include "Rom.h"
50
#include "RdCost.h"
51
52
#include <math.h>
53
54
#if defined(TARGET_SIMD_X86)  && ENABLE_SIMD_OPT_DIST
55
56
//! \ingroup CommonLib
57
//! \{
58
59
namespace vvenc {
60
61
typedef Pel Torg;
62
typedef Pel Tcur;
63
64
template<X86_VEXT vext >
65
Distortion RdCost::xGetSSE_SIMD( const DistParam &rcDtParam )
66
0
{
67
0
  const Torg* pSrc1     = (const Torg*)rcDtParam.org.buf;
68
0
  const Tcur* pSrc2     = (const Tcur*)rcDtParam.cur.buf;
69
0
  int  iRows            = rcDtParam.org.height;
70
0
  int  iCols            = rcDtParam.org.width;
71
0
  const int iStrideSrc1 = rcDtParam.org.stride;
72
0
  const int iStrideSrc2 = rcDtParam.cur.stride;
73
74
0
  const uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
75
0
  Distortion uiRet = 0;
76
77
0
  if( vext >= AVX2 && ( iCols & 15 ) == 0 )
78
0
  {
79
#ifdef USE_AVX2
80
    __m256i Sum = _mm256_setzero_si256();
81
0
    for( int iY = 0; iY < iRows; iY++ )
82
0
    {
83
0
      for( int iX = 0; iX < iCols; iX+=16 )
84
0
      {
85
0
        __m256i Src1 = ( _mm256_lddqu_si256( ( __m256i* )( &pSrc1[iX] ) ) );
86
0
        __m256i Src2 = ( _mm256_lddqu_si256( ( __m256i* )( &pSrc2[iX] ) ) );
87
0
        __m256i Diff = _mm256_sub_epi16( Src1, Src2 );
88
0
        __m256i Res = _mm256_madd_epi16( Diff, Diff );
89
0
        Sum = _mm256_add_epi32( Sum, Res );
90
0
      }
91
0
      pSrc1   += iStrideSrc1;
92
0
      pSrc2   += iStrideSrc2;
93
0
    }
94
    Sum = _mm256_hadd_epi32( Sum, Sum );
95
    Sum = _mm256_hadd_epi32( Sum, Sum );
96
    uiRet = ( _mm_cvtsi128_si32( _mm256_castsi256_si128( Sum ) ) + _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( Sum, Sum, 0x11 ) ) ) ) >> uiShift;
97
#endif
98
0
  }
99
0
  else if( ( iCols & 7 ) == 0 )
100
0
  {
101
0
    __m128i Sum = _mm_setzero_si128();
102
0
    for( int iY = 0; iY < iRows; iY++ )
103
0
    {
104
0
      for( int iX = 0; iX < iCols; iX += 8 )
105
0
      {
106
0
        __m128i Src1 = ( sizeof( Torg ) > 1 ) ? ( _mm_loadu_si128 ( ( const __m128i* )( &pSrc1[iX] ) ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )( &pSrc1[iX] ) ), _mm_setzero_si128() ) );
107
0
        __m128i Src2 = ( sizeof( Tcur ) > 1 ) ? ( _mm_loadu_si128( ( const __m128i* )( &pSrc2[iX] ) ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )( &pSrc2[iX] ) ), _mm_setzero_si128() ) );
108
0
        __m128i Diff = _mm_sub_epi16( Src1, Src2 );
109
0
        __m128i Res = _mm_madd_epi16( Diff, Diff );
110
0
        Sum = _mm_add_epi32( Sum, Res );
111
0
      }
112
0
      pSrc1   += iStrideSrc1;
113
0
      pSrc2   += iStrideSrc2;
114
0
    }
115
0
    Sum = _mm_hadd_epi32( Sum, Sum );
116
0
    Sum = _mm_hadd_epi32( Sum, Sum );
117
0
    uiRet = _mm_cvtsi128_si32( Sum )>>uiShift;
118
0
  }
119
0
  else
120
0
  {
121
0
    __m128i Sum = _mm_setzero_si128();
122
0
    for( int iY = 0; iY < iRows; iY++ )
123
0
    {
124
0
      for( int iX = 0; iX < iCols; iX += 4 )
125
0
      {
126
0
        __m128i Src1 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&pSrc1[iX] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&pSrc1[iX] ), _mm_setzero_si128() ) );
127
0
        __m128i Src2 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&pSrc2[iX] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&pSrc2[iX] ), _mm_setzero_si128() ) );
128
0
        __m128i Diff = _mm_sub_epi16( Src1, Src2 );
129
0
        __m128i Res = _mm_madd_epi16( Diff, Diff );
130
0
        Sum = _mm_add_epi32( Sum, Res );
131
0
      }
132
0
      pSrc1   += iStrideSrc1;
133
0
      pSrc2   += iStrideSrc2;
134
0
    }
135
0
    Sum = _mm_hadd_epi32( Sum, Sum );
136
0
    uiRet = _mm_cvtsi128_si32( Sum )>>uiShift;
137
0
  }
138
139
0
  return uiRet;
140
0
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
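
For orientation, a minimal scalar sketch of the quantity the kernel above accumulates (hypothetical standalone helper, not the vvenc fallback; 16-bit samples are assumed, and the shift is 2 * DISTORTION_PRECISION_ADJUSTMENT(bitDepth) as in the code above):

#include <cstdint>

// Hypothetical scalar reference for the vectorized sum of squared errors above.
static uint64_t scalarSSE( const int16_t* org, const int16_t* cur,
                           int width, int height,
                           int strideOrg, int strideCur, uint32_t shift )
{
  uint64_t sum = 0;
  for( int y = 0; y < height; y++ )
  {
    for( int x = 0; x < width; x++ )
    {
      const int d = org[x] - cur[x];
      sum += (uint64_t)( d * d );     // squared difference per sample
    }
    org += strideOrg;
    cur += strideCur;
  }
  return sum >> shift;                // shift == 2 * DISTORTION_PRECISION_ADJUSTMENT( bitDepth )
}
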
141
142
143
template<int iWidth, X86_VEXT vext >
144
Distortion RdCost::xGetSSE_NxN_SIMD( const DistParam &rcDtParam )
145
0
{
146
0
  const Torg* pSrc1     = (const Torg*)rcDtParam.org.buf;
147
0
  const Tcur* pSrc2     = (const Tcur*)rcDtParam.cur.buf;
148
0
  int  iRows            = rcDtParam.org.height;
149
0
  const int iStrideSrc1 = rcDtParam.org.stride;
150
0
  const int iStrideSrc2 = rcDtParam.cur.stride;
151
152
0
  const uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
153
0
  Distortion uiRet = 0;
154
155
0
  if( 4 == iWidth )
156
0
  {
157
0
    __m128i Sum = _mm_setzero_si128();
158
0
    for( int iY = 0; iY < iRows; iY++ )
159
0
    {
160
0
      __m128i Src1 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )pSrc1 ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)pSrc1 ), _mm_setzero_si128() ) );
161
0
      __m128i Src2 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )pSrc2 ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)pSrc2 ), _mm_setzero_si128() ) );
162
0
      pSrc1 += iStrideSrc1;
163
0
      pSrc2 += iStrideSrc2;
164
0
      __m128i Diff = _mm_sub_epi16( Src1, Src2 );
165
0
      __m128i Res  = _mm_madd_epi16( Diff, Diff );
166
0
      Sum = _mm_add_epi32( Sum, Res );
167
0
    }
168
0
    Sum = _mm_hadd_epi32( Sum, Sum );
169
0
    uiRet = _mm_cvtsi128_si32( Sum )>>uiShift;
170
0
  }
171
0
  else
172
0
  {
173
0
    if( vext >= AVX2 && iWidth >= 16 )
174
0
    {
175
#ifdef USE_AVX2
176
      __m256i Sum = _mm256_setzero_si256();
177
0
      for( int iY = 0; iY < iRows; iY++ )
178
0
      {
179
0
        for( int iX = 0; iX < iWidth; iX+=16 )
180
0
        {
181
0
          __m256i Src1 = ( sizeof( Torg ) > 1 ) ? ( _mm256_lddqu_si256( ( __m256i* )( &pSrc1[iX] ) ) ) : ( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_loadu_si128( ( __m128i* )( &pSrc1[iX] ) ) ), 0xD8 ), _mm256_setzero_si256() ) );
182
0
          __m256i Src2 = ( sizeof( Tcur ) > 1 ) ? ( _mm256_lddqu_si256( ( __m256i* )( &pSrc2[iX] ) ) ) : ( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_loadu_si128( ( __m128i* )( &pSrc2[iX] ) ) ), 0xD8 ), _mm256_setzero_si256() ) );
183
0
          __m256i Diff = _mm256_sub_epi16( Src1, Src2 );
184
0
          __m256i Res = _mm256_madd_epi16( Diff, Diff );
185
0
          Sum = _mm256_add_epi32( Sum, Res );
186
0
        }
187
0
        pSrc1   += iStrideSrc1;
188
0
        pSrc2   += iStrideSrc2;
189
0
      }
190
191
      __m256i vzero = _mm256_setzero_si256();
192
      Sum = _mm256_add_epi64( _mm256_unpacklo_epi32( Sum, vzero ), _mm256_unpackhi_epi32( Sum, vzero )); 
193
      Sum = _mm256_add_epi64( Sum, _mm256_permute4x64_epi64( Sum, 14 ) ); 
194
      Sum = _mm256_add_epi64( Sum, _mm256_permute4x64_epi64( Sum, 1 ) ); 
195
      uiRet = _mm_cvtsi128_si64( _mm256_castsi256_si128( Sum ))>>uiShift;
196
#endif
197
0
    }
198
0
    else
199
0
    {
200
0
      __m128i Sum = _mm_setzero_si128();
201
0
      for( int iY = 0; iY < iRows; iY++ )
202
0
      {
203
0
        for( int iX = 0; iX < iWidth; iX+=8 )
204
0
        {
205
0
          __m128i Src1 = ( sizeof( Torg ) > 1 ) ? ( _mm_loadu_si128( ( const __m128i* )( &pSrc1[iX] ) ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )( &pSrc1[iX] ) ), _mm_setzero_si128() ) );
206
0
          __m128i Src2 = ( sizeof( Tcur ) > 1 ) ? ( _mm_loadu_si128( ( const __m128i* )( &pSrc2[iX] ) ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )( &pSrc2[iX] ) ), _mm_setzero_si128() ) );
207
0
          __m128i Diff = _mm_sub_epi16( Src1, Src2 );
208
0
          __m128i Res = _mm_madd_epi16( Diff, Diff );
209
0
          Sum = _mm_add_epi32( Sum, Res );
210
0
        }
211
0
        pSrc1 += iStrideSrc1;
212
0
        pSrc2 += iStrideSrc2;
213
0
      }
214
215
0
      __m128i vzero = _mm_setzero_si128();
216
0
      Sum = _mm_add_epi64( _mm_unpacklo_epi32( Sum, vzero ), _mm_unpackhi_epi32( Sum, vzero ));
217
0
      uiRet = (_mm_cvtsi128_si64( Sum ) + _mm_extract_epi64( Sum, 1 ))>>uiShift;
218
0
    }
219
0
  }
220
221
0
  return uiRet;
222
0
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<4, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<8, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<16, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<32, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<64, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<128, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<4, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<8, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<16, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<32, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<64, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<128, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
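
The widths-of-8 path of this function widens its four 32-bit lane sums to 64 bits before the final shift. A standalone illustration of that reduction, assuming SSE4.1 and an x86-64 target (the helper name is made up):

#include <emmintrin.h>   // SSE2
#include <smmintrin.h>   // SSE4.1 (_mm_extract_epi64)
#include <cstdint>

// Hypothetical standalone version of the 32-bit -> 64-bit horizontal reduction
// used above: widen the four lane sums by interleaving with zero, add them
// pairwise as 64-bit values, then combine the two halves.
static uint64_t horizontalSum32to64( __m128i sum32 )
{
  const __m128i vzero = _mm_setzero_si128();
  const __m128i lo = _mm_unpacklo_epi32( sum32, vzero );   // lanes 0 and 1, zero-extended
  const __m128i hi = _mm_unpackhi_epi32( sum32, vzero );   // lanes 2 and 3, zero-extended
  const __m128i s  = _mm_add_epi64( lo, hi );
  return (uint64_t)_mm_cvtsi128_si64( s ) + (uint64_t)_mm_extract_epi64( s, 1 );
}
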
223
224
template< X86_VEXT vext >
225
Distortion RdCost::xGetSAD_SIMD( const DistParam &rcDtParam )
226
0
{
227
0
  if( rcDtParam.org.width < 4 )
228
0
    return RdCost::xGetSAD( rcDtParam );
229
230
0
  const short* pSrc1   = (const short*)rcDtParam.org.buf;
231
0
  const short* pSrc2   = (const short*)rcDtParam.cur.buf;
232
0
  int  iRows           = rcDtParam.org.height;
233
0
  int  iCols           = rcDtParam.org.width;
234
0
  int  iSubShift       = rcDtParam.subShift;
235
0
  int  iSubStep        = ( 1 << iSubShift );
236
0
  const int iStrideSrc1 = rcDtParam.org.stride * iSubStep;
237
0
  const int iStrideSrc2 = rcDtParam.cur.stride * iSubStep;
238
239
0
  uint32_t uiSum = 0;
240
0
  if( vext >= AVX2 && ( iCols & 15 ) == 0 )
241
0
  {
242
#ifdef USE_AVX2
243
    // Do for widths that are a multiple of 16
244
    __m256i vzero = _mm256_setzero_si256();
245
    __m256i vsum32 = vzero;
246
0
    for( int iY = 0; iY < iRows; iY+=iSubStep )
247
0
    {
248
0
      __m256i vsum16 = vzero;
249
0
      for( int iX = 0; iX < iCols; iX+=16 )
250
0
      {
251
0
        __m256i vsrc1 = _mm256_lddqu_si256( ( __m256i* )( &pSrc1[iX] ) );
252
0
        __m256i vsrc2 = _mm256_lddqu_si256( ( __m256i* )( &pSrc2[iX] ) );
253
0
        vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) );
254
0
      }
255
0
      __m256i vsumtemp = _mm256_add_epi32( _mm256_unpacklo_epi16( vsum16, vzero ), _mm256_unpackhi_epi16( vsum16, vzero ) );
256
0
      vsum32 = _mm256_add_epi32( vsum32, vsumtemp );
257
0
      pSrc1   += iStrideSrc1;
258
0
      pSrc2   += iStrideSrc2;
259
0
    }
260
    vsum32 = _mm256_hadd_epi32( vsum32, vzero );
261
    vsum32 = _mm256_hadd_epi32( vsum32, vzero );
262
    uiSum =  _mm_cvtsi128_si32( _mm256_castsi256_si128( vsum32 ) ) + _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( vsum32, vsum32, 0x11 ) ) );
263
#endif
264
0
  }
265
0
  else if( ( iCols & 7 ) == 0 )
266
0
  {
267
    // Do with step of 8
268
0
    __m128i vzero = _mm_setzero_si128();
269
0
    __m128i vsum32 = vzero;
270
0
    for( int iY = 0; iY < iRows; iY+=iSubStep )
271
0
    {
272
0
      __m128i vsum16 = vzero;
273
0
      for( int iX = 0; iX < iCols; iX+=8 )
274
0
      {
275
0
        __m128i vsrc1 = _mm_loadu_si128( ( const __m128i* )( &pSrc1[iX] ) );
276
0
        __m128i vsrc2 = _mm_loadu_si128( ( const __m128i* )( &pSrc2[iX] ) );
277
0
        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
278
0
      }
279
0
      __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
280
0
      vsum32 = _mm_add_epi32( vsum32, vsumtemp );
281
0
      pSrc1   += iStrideSrc1;
282
0
      pSrc2   += iStrideSrc2;
283
0
    }
284
0
    vsum32 = _mm_hadd_epi32( vsum32, vzero );
285
0
    vsum32 = _mm_hadd_epi32( vsum32, vzero );
286
0
    uiSum  =  _mm_cvtsi128_si32( vsum32 );
287
0
  }
288
0
  else
289
0
  {
290
    // Do with step of 4
291
0
    CHECK( ( iCols & 3 ) != 0, "Not divisible by 4: " << iCols );
292
0
    __m128i vzero = _mm_setzero_si128();
293
0
    __m128i vsum32 = vzero;
294
0
    for( int iY = 0; iY < iRows; iY += iSubStep )
295
0
    {
296
0
      __m128i vsum16 = vzero;
297
0
      for( int iX = 0; iX < iCols; iX+=4 )
298
0
      {
299
0
        __m128i vsrc1 = _vv_loadl_epi64( ( const __m128i* )&pSrc1[iX] );
300
0
        __m128i vsrc2 = _vv_loadl_epi64( ( const __m128i* )&pSrc2[iX] );
301
0
        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
302
0
      }
303
0
      __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
304
0
      vsum32 = _mm_add_epi32( vsum32, vsumtemp );
305
0
      pSrc1 += iStrideSrc1;
306
0
      pSrc2 += iStrideSrc2;
307
0
    }
308
0
    vsum32 = _mm_hadd_epi32( vsum32, vzero );
309
0
    vsum32 = _mm_hadd_epi32( vsum32, vzero );
310
0
    uiSum  = _mm_cvtsi128_si32( vsum32 );
311
0
  }
312
313
0
  uiSum <<= iSubShift;
314
0
  return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
315
0
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
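
As a reference for what the vector paths accumulate, a hypothetical scalar sketch of the same SAD including the subShift row subsampling: only every (1 << subShift)-th row is visited and the partial sum is scaled back up before the precision shift, as in the code above:

#include <cstdint>
#include <cstdlib>

// Hypothetical scalar reference for the vectorized SAD above.
static uint64_t scalarSAD( const int16_t* org, const int16_t* cur,
                           int width, int height,
                           int strideOrg, int strideCur,
                           int subShift, uint32_t precShift )
{
  const int step = 1 << subShift;
  uint64_t sum = 0;
  for( int y = 0; y < height; y += step )
  {
    for( int x = 0; x < width; x++ )
      sum += std::abs( org[x] - cur[x] );
    org += strideOrg * step;
    cur += strideCur * step;
  }
  sum <<= subShift;          // compensate for the skipped rows
  return sum >> precShift;   // precShift == DISTORTION_PRECISION_ADJUSTMENT( bitDepth )
}
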
316
317
318
template< int iWidth, X86_VEXT vext >
319
Distortion RdCost::xGetSAD_NxN_SIMD( const DistParam &rcDtParam )
320
0
{
321
  //  assert( rcDtParam.iCols == iWidth);
322
0
  const short* pSrc1   = (const short*)rcDtParam.org.buf;
323
0
  const short* pSrc2   = (const short*)rcDtParam.cur.buf;
324
0
  int  iRows           = rcDtParam.org.height;
325
0
  int  iSubShift       = rcDtParam.subShift;
326
0
  int  iSubStep        = ( 1 << iSubShift );
327
0
  const int iStrideSrc1 = rcDtParam.org.stride * iSubStep;
328
0
  const int iStrideSrc2 = rcDtParam.cur.stride * iSubStep;
329
330
0
  uint32_t uiSum = 0;
331
332
0
  if( iWidth == 4 )
333
0
  {
334
0
    if( iRows == 4 && iSubShift == 0 )
335
0
    {
336
0
      __m128i vzero = _mm_setzero_si128();
337
0
      __m128i vsrc1 = _mm_or_si128( _vv_loadl_epi64( ( const __m128i* )pSrc1 ), _mm_slli_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc1[iStrideSrc1] ) ), 8 ) );
338
0
      __m128i vsrc2 = _mm_or_si128( _vv_loadl_epi64( ( const __m128i* )pSrc2 ), _mm_slli_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc2[iStrideSrc2] ) ), 8 ) );
339
0
      __m128i vsum  = _mm_cvtepi16_epi32( _mm_hadd_epi16( _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ), vzero ) );
340
341
0
      vsrc1 = _mm_or_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc1[2 * iStrideSrc1] ) ), _mm_slli_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc1[3 * iStrideSrc1] ) ), 8 ) );
342
0
      vsrc2 = _mm_or_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc2[2 * iStrideSrc2] ) ), _mm_slli_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc2[3 * iStrideSrc2] ) ), 8 ) );
343
0
      vsum  = _mm_add_epi32( vsum, _mm_cvtepi16_epi32( _mm_hadd_epi16( _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ), vzero ) ) );
344
0
      vsum  = _mm_hadd_epi32( vsum, vzero );
345
0
      vsum  = _mm_hadd_epi32( vsum, vzero );
346
347
0
      uiSum = _mm_cvtsi128_si32( vsum );
348
0
    }
349
0
    else
350
0
    {
351
0
      __m128i vone = _mm_set1_epi16( 1 );
352
0
      __m128i vsum32 = _mm_setzero_si128();
353
0
      for( int iY = 0; iY < iRows; iY += iSubStep )
354
0
      {
355
0
        __m128i vsrc1 = _mm_cvtepi16_epi32( _vv_loadl_epi64( ( const __m128i* )pSrc1 ) );
356
0
        __m128i vsrc2 = _mm_cvtepi16_epi32( _vv_loadl_epi64( ( const __m128i* )pSrc2 ) );
357
0
        vsum32 = _mm_add_epi32( vsum32, _mm_abs_epi32( _mm_sub_epi32( vsrc1, vsrc2 ) ) );
358
359
0
        pSrc1 += iStrideSrc1;
360
0
        pSrc2 += iStrideSrc2;
361
0
      }
362
0
      vsum32 = _mm_hadd_epi32( vsum32, vone );
363
0
      vsum32 = _mm_hadd_epi32( vsum32, vone );
364
0
      uiSum = _mm_cvtsi128_si32( vsum32 );
365
0
    }
366
0
  }
367
0
  else
368
0
  {
369
#ifdef USE_AVX2
370
0
    if( vext >= AVX2 && iWidth >= 16 )
371
0
    {
372
0
      static constexpr bool earlyExitAllowed = iWidth >= 64;
373
      // Do for widths that are a multiple of 16
374
0
      __m256i vone   = _mm256_set1_epi16( 1 );
375
0
      __m256i vsum32 = _mm256_setzero_si256();
376
377
      int checkExit = 3;
378
379
0
      for( int iY = 0; iY < iRows; iY+=iSubStep )
380
0
      {
381
0
        __m256i vsrc1  = _mm256_loadu_si256( ( __m256i* )( pSrc1 ) );
382
0
        __m256i vsrc2  = _mm256_loadu_si256( ( __m256i* )( pSrc2 ) );
383
0
        __m256i vsum16 = _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) );
384
385
0
        for( int iX = 16; iX < iWidth; iX+=16 )
386
0
        {
387
0
          vsrc1  = _mm256_loadu_si256( ( __m256i* )( &pSrc1[iX] ) );
388
0
          vsrc2  = _mm256_loadu_si256( ( __m256i* )( &pSrc2[iX] ) );
389
0
          vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) );
390
0
        }
391
392
0
        __m256i vsumtemp = _mm256_madd_epi16( vsum16, vone );
393
0
        if( earlyExitAllowed ) vsum32 = _mm256_hadd_epi32( vsum32, vsumtemp );
394
0
        else                   vsum32 = _mm256_add_epi32 ( vsum32, vsumtemp );
395
396
0
        pSrc1   += iStrideSrc1;
397
0
        pSrc2   += iStrideSrc2;
398
399
0
        if( earlyExitAllowed && checkExit == 0 )
400
0
        {
401
0
          Distortion distTemp = _mm256_extract_epi32( vsum32, 0 ) + _mm256_extract_epi32( vsum32, 4 );
402
0
          distTemp <<= iSubShift;
403
0
          distTemp >>= DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth );
404
0
          if( distTemp > rcDtParam.maximumDistortionForEarlyExit ) return distTemp;
405
0
          checkExit = 3;
406
0
        }
407
0
        else if( earlyExitAllowed )
408
0
        {
409
0
          checkExit--;
410
0
        }
411
0
      }
412
413
0
      __m128i
414
0
      xsum32 = _mm_add_epi32( _mm256_castsi256_si128( vsum32 ), _mm256_extracti128_si256( vsum32, 1 ) );
415
0
      xsum32 = _mm_hadd_epi32( xsum32, xsum32 );
416
0
      xsum32 = _mm_hadd_epi32( xsum32, xsum32 );
417
0
      uiSum  = _mm_cvtsi128_si32( xsum32 );
418
0
    }
419
0
    else
420
0
#endif
421
0
    if( iRows == 16 && ( iWidth == 16 || iWidth == 8 ) && iSubShift == 1 && rcDtParam.bitDepth <= 10 )
422
0
    {
423
0
      static constexpr bool isWdt16 = iWidth >= 16;
424
425
0
      __m128i vone   = _mm_set1_epi16( 1 );
426
0
      __m128i vsum32 = _mm_setzero_si128();
427
428
0
      for( int i = 0; i < 2; i++ )
429
0
      {
430
        //0
431
0
        __m128i vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) );
432
0
        __m128i vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) );
433
434
0
        __m128i vsum16 = _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) );
435
436
0
        if( isWdt16 )
437
0
        {
438
0
          vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) );
439
0
          vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) );
440
441
0
          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
442
0
        }
443
444
0
        pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;
445
446
        // 1
447
0
        vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) );
448
0
        vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) );
449
450
0
        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
451
452
0
        if( isWdt16 )
453
0
        {
454
0
          vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) );
455
0
          vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) );
456
457
0
          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
458
0
        }
459
460
0
        pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;
461
462
        // 2
463
0
        vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) );
464
0
        vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) );
465
466
0
        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
467
468
0
        if( isWdt16 )
469
0
        {
470
0
          vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) );
471
0
          vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) );
472
473
0
          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
474
0
        }
475
476
0
        pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;
477
478
        // 3
479
0
        vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) );
480
0
        vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) );
481
482
0
        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
483
484
0
        if( isWdt16 )
485
0
        {
486
0
          vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) );
487
0
          vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) );
488
489
0
          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
490
0
        }
491
492
0
        pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;
493
494
0
        vsum32 = _mm_add_epi32( vsum32, _mm_madd_epi16( vsum16, vone ) );
495
0
      }
496
497
0
      vsum32 = _mm_hadd_epi32( vsum32, vone );
498
0
      vsum32 = _mm_hadd_epi32( vsum32, vone );
499
0
      uiSum = _mm_cvtsi128_si32( vsum32 );
500
501
0
      uiSum <<= 1;
502
0
      return uiSum >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth );
503
0
    }
504
0
    else
505
0
    {
506
0
      static constexpr bool earlyExitAllowed = iWidth >= 64;
507
508
      // For widths that are a multiple of 8
509
0
      __m128i vone   = _mm_set1_epi16( 1 );
510
0
      __m128i vsum32 = _mm_setzero_si128();
511
512
0
      int checkExit = 3;
513
514
0
      for( int iY = 0; iY < iRows; iY+=iSubStep )
515
0
      {
516
0
        __m128i vsrc1  = _mm_loadu_si128( ( const __m128i* )( pSrc1 ) );
517
0
        __m128i vsrc2  = _mm_loadu_si128( ( const __m128i* )( pSrc2 ) );
518
0
        __m128i vsum16 = _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) );
519
520
0
        if( iWidth >= 16 )
521
0
        {
522
0
          vsrc1  = _mm_loadu_si128( ( const __m128i* )( &pSrc1[8] ) );
523
0
          vsrc2  = _mm_loadu_si128( ( const __m128i* )( &pSrc2[8] ) );
524
0
          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
525
526
0
          for( int iX = 16; iX < iWidth; iX += 16 )
527
0
          {
528
0
            vsrc1  = _mm_loadu_si128( ( const __m128i* )( &pSrc1[iX] ) );
529
0
            vsrc2  = _mm_loadu_si128( ( const __m128i* )( &pSrc2[iX] ) );
530
0
            vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
531
            
532
0
            vsrc1  = _mm_loadu_si128( ( const __m128i* )( &pSrc1[iX + 8] ) );
533
0
            vsrc2  = _mm_loadu_si128( ( const __m128i* )( &pSrc2[iX + 8] ) );
534
0
            vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
535
0
          }
536
0
        }
537
538
0
        __m128i vsumtemp = _mm_madd_epi16( vsum16, vone );
539
0
        if( earlyExitAllowed ) vsum32 = _mm_hadd_epi32( vsum32, vsumtemp );
540
0
        else                   vsum32 = _mm_add_epi32 ( vsum32, vsumtemp );
541
542
0
        pSrc1   += iStrideSrc1;
543
0
        pSrc2   += iStrideSrc2;
544
545
0
        if( earlyExitAllowed && checkExit == 0 )
546
0
        {
547
0
          Distortion distTemp = _mm_cvtsi128_si32( vsum32 );
548
0
          distTemp <<= iSubShift;
549
0
          distTemp >>= DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth );
550
0
          if( distTemp > rcDtParam.maximumDistortionForEarlyExit ) return distTemp;
551
0
          checkExit = 3;
552
0
        }
553
0
        else if( earlyExitAllowed )
554
0
        {
555
0
          checkExit--;
556
0
        }
557
0
      }
558
0
      vsum32 = _mm_hadd_epi32( vsum32, vone );
559
0
      vsum32 = _mm_hadd_epi32( vsum32, vone );
560
0
      uiSum =  _mm_cvtsi128_si32( vsum32 );
561
0
    }
562
0
  }
563
564
0
  uiSum <<= iSubShift;
565
0
  return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
566
0
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<4, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<8, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<16, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<32, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<64, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<128, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<4, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<8, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<16, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<32, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<64, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<128, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
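
For the wide-block instantiations (iWidth >= 64) the function above checks every fourth processed row whether the running sum, scaled like the final result, already exceeds rcDtParam.maximumDistortionForEarlyExit. A hypothetical scalar sketch of that pattern (the SIMD version folds its accumulator with hadd so that a partial total can be read cheaply from the low lane for the comparison):

#include <cstdint>
#include <cstdlib>

// Hypothetical scalar sketch of the SAD early-exit pattern used above.
static uint64_t sadWithEarlyExit( const int16_t* org, const int16_t* cur,
                                  int width, int height,
                                  int strideOrg, int strideCur,
                                  int subShift, uint32_t precShift,
                                  uint64_t maxDistortion )
{
  const int step = 1 << subShift;
  uint64_t sum = 0;
  int checkExit = 3;
  for( int y = 0; y < height; y += step )
  {
    for( int x = 0; x < width; x++ )
      sum += std::abs( org[x] - cur[x] );
    org += strideOrg * step;
    cur += strideCur * step;

    if( checkExit == 0 )
    {
      const uint64_t partial = ( sum << subShift ) >> precShift;
      if( partial > maxDistortion )
        return partial;        // abort: already worse than the current best candidate
      checkExit = 3;
    }
    else
    {
      checkExit--;
    }
  }
  return ( sum << subShift ) >> precShift;
}
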
567
568
static uint32_t xCalcHAD4x4_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur )
569
0
{
570
0
  __m128i r0 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piOrg[0] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piOrg[0] ), _mm_setzero_si128() ) );
571
0
  __m128i r1 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piOrg[iStrideOrg] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piOrg[iStrideOrg] ), _mm_setzero_si128() ) );
572
0
  __m128i r2 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piOrg[2 * iStrideOrg] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piOrg[2 * iStrideOrg] ), _mm_setzero_si128() ) );
573
0
  __m128i r3 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piOrg[3 * iStrideOrg] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piOrg[3 * iStrideOrg] ), _mm_setzero_si128() ) );
574
0
  __m128i r4 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piCur[0] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piCur[0] ), _mm_setzero_si128() ) );
575
0
  __m128i r5 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piCur[iStrideCur] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piCur[iStrideCur] ), _mm_setzero_si128() ) );
576
0
  __m128i r6 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piCur[2 * iStrideCur] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piCur[2 * iStrideCur] ), _mm_setzero_si128() ) );
577
0
  __m128i r7 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piCur[3 * iStrideCur] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piCur[3 * iStrideCur] ), _mm_setzero_si128() ) );
578
579
0
  r0 = _mm_sub_epi16( r0, r4 );
580
0
  r1 = _mm_sub_epi16( r1, r5 );
581
0
  r2 = _mm_sub_epi16( r2, r6 );
582
0
  r3 = _mm_sub_epi16( r3, r7 );
583
584
  // first stage
585
0
  r4 = r0;
586
0
  r5 = r1;
587
588
0
  r0 = _mm_add_epi16( r0, r3 );
589
0
  r1 = _mm_add_epi16( r1, r2 );
590
591
0
  r4 = _mm_sub_epi16( r4, r3 );
592
0
  r5 = _mm_sub_epi16( r5, r2 );
593
594
0
  r2 = r0;
595
0
  r3 = r4;
596
597
0
  r0 = _mm_add_epi16( r0, r1 );
598
0
  r2 = _mm_sub_epi16( r2, r1 );
599
0
  r3 = _mm_sub_epi16( r3, r5 );
600
0
  r5 = _mm_add_epi16( r5, r4 );
601
602
  // shuffle - flip matrix for vertical transform
603
0
  r0 = _mm_unpacklo_epi16( r0, r5 );
604
0
  r2 = _mm_unpacklo_epi16( r2, r3 );
605
606
0
  r3 = r0;
607
0
  r0 = _mm_unpacklo_epi32( r0, r2 );
608
0
  r3 = _mm_unpackhi_epi32( r3, r2 );
609
610
0
  r1 = r0;
611
0
  r2 = r3;
612
0
  r1 = _mm_srli_si128( r1, 8 );
613
0
  r3 = _mm_srli_si128( r3, 8 );
614
615
  // second stage
616
0
  r4 = r0;
617
0
  r5 = r1;
618
619
0
  r0 = _mm_add_epi16( r0, r3 );
620
0
  r1 = _mm_add_epi16( r1, r2 );
621
622
0
  r4 = _mm_sub_epi16( r4, r3 );
623
0
  r5 = _mm_sub_epi16( r5, r2 );
624
625
0
  r2 = r0;
626
0
  r3 = r4;
627
628
0
  r0 = _mm_add_epi16( r0, r1 );
629
0
  r2 = _mm_sub_epi16( r2, r1 );
630
0
  r3 = _mm_sub_epi16( r3, r5 );
631
0
  r5 = _mm_add_epi16( r5, r4 );
632
633
  // abs
634
0
  __m128i Sum = _mm_abs_epi16( r0 );
635
0
  uint32_t absDc = _mm_cvtsi128_si32( Sum ) & 0x0000ffff;
636
0
  Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r2 ) );
637
0
  Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r3 ) );
638
0
  Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r5 ) );
639
640
0
  __m128i iZero = _mm_set1_epi16( 0 );
641
0
  Sum = _mm_unpacklo_epi16( Sum, iZero );
642
0
  Sum = _mm_hadd_epi32( Sum, Sum );
643
0
  Sum = _mm_hadd_epi32( Sum, Sum );
644
645
0
  uint32_t sad = _mm_cvtsi128_si32( Sum );
646
  
647
0
  sad -= absDc;
648
0
  sad += absDc >> 2;
649
0
  sad = ( ( sad + 1 ) >> 1 );
650
651
0
  return sad;
652
0
}
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD4x4_SSE(short const*, short const*, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD4x4_SSE(short const*, short const*, int, int)
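
xCalcHAD4x4_SSE computes a 4x4 Hadamard-transformed SAD of the difference block. A hypothetical scalar sketch of the same metric follows; the coefficient ordering differs from the SIMD lane layout, but the accumulated magnitude, the down-weighting of the DC term to a quarter, and the final (sad + 1) >> 1 scaling should match:

#include <cstdint>
#include <cstdlib>

// Hypothetical scalar sketch of the 4x4 Hadamard SATD above.
static uint32_t scalarHAD4x4( const int16_t* org, const int16_t* cur,
                              int strideOrg, int strideCur )
{
  int d[4][4], m[4][4];
  for( int y = 0; y < 4; y++ )
    for( int x = 0; x < 4; x++ )
      d[y][x] = org[y * strideOrg + x] - cur[y * strideCur + x];

  for( int y = 0; y < 4; y++ )   // butterflies along rows
  {
    const int a = d[y][0] + d[y][3], b = d[y][1] + d[y][2];
    const int c = d[y][0] - d[y][3], e = d[y][1] - d[y][2];
    m[y][0] = a + b;  m[y][1] = c + e;  m[y][2] = a - b;  m[y][3] = c - e;
  }
  for( int x = 0; x < 4; x++ )   // butterflies along columns
  {
    const int a = m[0][x] + m[3][x], b = m[1][x] + m[2][x];
    const int c = m[0][x] - m[3][x], e = m[1][x] - m[2][x];
    d[0][x] = a + b;  d[1][x] = c + e;  d[2][x] = a - b;  d[3][x] = c - e;
  }

  uint32_t sad = 0;
  for( int y = 0; y < 4; y++ )
    for( int x = 0; x < 4; x++ )
      sad += std::abs( d[y][x] );

  const uint32_t absDc = std::abs( d[0][0] );
  sad -= absDc;                  // replace the full DC weight ...
  sad += absDc >> 2;             // ... by a quarter weight
  return ( sad + 1 ) >> 1;
}
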
653
654
//working up to 12-bit
655
static uint32_t xCalcHAD8x8_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth )
656
0
{
657
0
  __m128i m1[2][8], m2[2][8];
658
659
0
  CHECK( iBitDepth > 10, "Only bit-depths of up to 10 bits supported!" );
660
661
0
  for( int k = 0; k < 8; k++ )
662
0
  {
663
0
    __m128i r0 = ( sizeof( Torg ) > 1 ) ? ( _mm_loadu_si128( ( __m128i* )piOrg ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )piOrg ), _mm_setzero_si128() ) );
664
0
    __m128i r1 = ( sizeof( Tcur ) > 1 ) ? ( _mm_loadu_si128( ( __m128i* )piCur ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )piCur ), _mm_setzero_si128() ) );
665
0
    m2[0][k] = _mm_sub_epi16( r0, r1 ); // 11bit
666
0
    piCur += iStrideCur;
667
0
    piOrg += iStrideOrg;
668
0
  }
669
670
  //horizontal
671
0
  m1[0][0] = _mm_add_epi16( m2[0][0], m2[0][4] );
672
0
  m1[0][1] = _mm_add_epi16( m2[0][1], m2[0][5] );
673
0
  m1[0][2] = _mm_add_epi16( m2[0][2], m2[0][6] );
674
0
  m1[0][3] = _mm_add_epi16( m2[0][3], m2[0][7] );
675
0
  m1[0][4] = _mm_sub_epi16( m2[0][0], m2[0][4] );
676
0
  m1[0][5] = _mm_sub_epi16( m2[0][1], m2[0][5] );
677
0
  m1[0][6] = _mm_sub_epi16( m2[0][2], m2[0][6] );
678
0
  m1[0][7] = _mm_sub_epi16( m2[0][3], m2[0][7] ); // 12 bit
679
680
0
  m2[0][0] = _mm_add_epi16( m1[0][0], m1[0][2] );
681
0
  m2[0][1] = _mm_add_epi16( m1[0][1], m1[0][3] );
682
0
  m2[0][2] = _mm_sub_epi16( m1[0][0], m1[0][2] );
683
0
  m2[0][3] = _mm_sub_epi16( m1[0][1], m1[0][3] );
684
0
  m2[0][4] = _mm_add_epi16( m1[0][4], m1[0][6] );
685
0
  m2[0][5] = _mm_add_epi16( m1[0][5], m1[0][7] );
686
0
  m2[0][6] = _mm_sub_epi16( m1[0][4], m1[0][6] );
687
0
  m2[0][7] = _mm_sub_epi16( m1[0][5], m1[0][7] ); // 13 bit
688
689
0
  m1[0][0] = _mm_add_epi16( m2[0][0], m2[0][1] );
690
0
  m1[0][1] = _mm_sub_epi16( m2[0][0], m2[0][1] );
691
0
  m1[0][2] = _mm_add_epi16( m2[0][2], m2[0][3] );
692
0
  m1[0][3] = _mm_sub_epi16( m2[0][2], m2[0][3] );
693
0
  m1[0][4] = _mm_add_epi16( m2[0][4], m2[0][5] );
694
0
  m1[0][5] = _mm_sub_epi16( m2[0][4], m2[0][5] );
695
0
  m1[0][6] = _mm_add_epi16( m2[0][6], m2[0][7] );
696
0
  m1[0][7] = _mm_sub_epi16( m2[0][6], m2[0][7] ); // 14 bit
697
698
0
  m2[0][0] = _mm_unpacklo_epi16( m1[0][0], m1[0][1] );
699
0
  m2[0][1] = _mm_unpacklo_epi16( m1[0][2], m1[0][3] );
700
0
  m2[0][2] = _mm_unpackhi_epi16( m1[0][0], m1[0][1] );
701
0
  m2[0][3] = _mm_unpackhi_epi16( m1[0][2], m1[0][3] );
702
0
  m2[0][4] = _mm_unpacklo_epi16( m1[0][4], m1[0][5] );
703
0
  m2[0][5] = _mm_unpacklo_epi16( m1[0][6], m1[0][7] );
704
0
  m2[0][6] = _mm_unpackhi_epi16( m1[0][4], m1[0][5] );
705
0
  m2[0][7] = _mm_unpackhi_epi16( m1[0][6], m1[0][7] );
706
707
0
  m1[0][0] = _mm_unpacklo_epi32( m2[0][0], m2[0][1] );
708
0
  m1[0][1] = _mm_unpackhi_epi32( m2[0][0], m2[0][1] );
709
0
  m1[0][2] = _mm_unpacklo_epi32( m2[0][2], m2[0][3] );
710
0
  m1[0][3] = _mm_unpackhi_epi32( m2[0][2], m2[0][3] );
711
0
  m1[0][4] = _mm_unpacklo_epi32( m2[0][4], m2[0][5] );
712
0
  m1[0][5] = _mm_unpackhi_epi32( m2[0][4], m2[0][5] );
713
0
  m1[0][6] = _mm_unpacklo_epi32( m2[0][6], m2[0][7] );
714
0
  m1[0][7] = _mm_unpackhi_epi32( m2[0][6], m2[0][7] );
715
  
716
0
  m1[1][0] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][0], 8 ) );
717
0
  m1[0][0] = _mm_cvtepi16_epi32(                 m1[0][0]      );
718
0
  m1[1][1] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][1], 8 ) );
719
0
  m1[0][1] = _mm_cvtepi16_epi32(                 m1[0][1]      );
720
0
  m1[1][2] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][2], 8 ) );
721
0
  m1[0][2] = _mm_cvtepi16_epi32(                 m1[0][2]      );
722
0
  m1[1][3] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][3], 8 ) );
723
0
  m1[0][3] = _mm_cvtepi16_epi32(                 m1[0][3]      );
724
0
  m1[1][4] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][4], 8 ) );
725
0
  m1[0][4] = _mm_cvtepi16_epi32(                 m1[0][4]      );
726
0
  m1[1][5] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][5], 8 ) );
727
0
  m1[0][5] = _mm_cvtepi16_epi32(                 m1[0][5]      );
728
0
  m1[1][6] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][6], 8 ) );
729
0
  m1[0][6] = _mm_cvtepi16_epi32(                 m1[0][6]      );
730
0
  m1[1][7] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][7], 8 ) );
731
0
  m1[0][7] = _mm_cvtepi16_epi32(                 m1[0][7]      );
732
733
0
  for( int i = 0; i < 8; i++ )
734
0
  {
735
0
    int ii = i % 4;
736
0
    int ij = i >> 2;
737
738
0
    m2[0][i] = m1[ij][ii    ];
739
0
    m2[1][i] = m1[ij][ii + 4];
740
0
  }
741
742
0
  for( int i = 0; i < 2; i++ )
743
0
  {
744
0
    m1[i][0] = _mm_add_epi32( m2[i][0], m2[i][4] );
745
0
    m1[i][1] = _mm_add_epi32( m2[i][1], m2[i][5] );
746
0
    m1[i][2] = _mm_add_epi32( m2[i][2], m2[i][6] );
747
0
    m1[i][3] = _mm_add_epi32( m2[i][3], m2[i][7] );
748
0
    m1[i][4] = _mm_sub_epi32( m2[i][0], m2[i][4] );
749
0
    m1[i][5] = _mm_sub_epi32( m2[i][1], m2[i][5] );
750
0
    m1[i][6] = _mm_sub_epi32( m2[i][2], m2[i][6] );
751
0
    m1[i][7] = _mm_sub_epi32( m2[i][3], m2[i][7] );
752
753
0
    m2[i][0] = _mm_add_epi32( m1[i][0], m1[i][2] );
754
0
    m2[i][1] = _mm_add_epi32( m1[i][1], m1[i][3] );
755
0
    m2[i][2] = _mm_sub_epi32( m1[i][0], m1[i][2] );
756
0
    m2[i][3] = _mm_sub_epi32( m1[i][1], m1[i][3] );
757
0
    m2[i][4] = _mm_add_epi32( m1[i][4], m1[i][6] );
758
0
    m2[i][5] = _mm_add_epi32( m1[i][5], m1[i][7] );
759
0
    m2[i][6] = _mm_sub_epi32( m1[i][4], m1[i][6] );
760
0
    m2[i][7] = _mm_sub_epi32( m1[i][5], m1[i][7] );
761
762
0
    m1[i][0] = _mm_abs_epi32( _mm_add_epi32( m2[i][0], m2[i][1] ) );
763
0
    m1[i][1] = _mm_abs_epi32( _mm_sub_epi32( m2[i][0], m2[i][1] ) );
764
0
    m1[i][2] = _mm_abs_epi32( _mm_add_epi32( m2[i][2], m2[i][3] ) );
765
0
    m1[i][3] = _mm_abs_epi32( _mm_sub_epi32( m2[i][2], m2[i][3] ) );
766
0
    m1[i][4] = _mm_abs_epi32( _mm_add_epi32( m2[i][4], m2[i][5] ) );
767
0
    m1[i][5] = _mm_abs_epi32( _mm_sub_epi32( m2[i][4], m2[i][5] ) );
768
0
    m1[i][6] = _mm_abs_epi32( _mm_add_epi32( m2[i][6], m2[i][7] ) );
769
0
    m1[i][7] = _mm_abs_epi32( _mm_sub_epi32( m2[i][6], m2[i][7] ) );
770
0
  }
771
0
  m2[0][0] = m1[0][0];
772
0
  for( int i = 0; i < 8; i++ )
773
0
  {
774
0
    m1[0][i] = _mm_add_epi32( m1[0][i], m1[1][i] );
775
0
  }
776
777
0
  m1[0][0] = _mm_add_epi32( m1[0][0], m1[0][1] );
778
0
  m1[0][2] = _mm_add_epi32( m1[0][2], m1[0][3] );
779
0
  m1[0][4] = _mm_add_epi32( m1[0][4], m1[0][5] );
780
0
  m1[0][6] = _mm_add_epi32( m1[0][6], m1[0][7] );
781
782
0
  m1[0][0] = _mm_add_epi32( m1[0][0], m1[0][2] );
783
0
  m1[0][4] = _mm_add_epi32( m1[0][4], m1[0][6] );
784
0
  __m128i iSum = _mm_add_epi32( m1[0][0], m1[0][4] );
785
786
0
  iSum = _mm_hadd_epi32( iSum, iSum );
787
0
  iSum = _mm_hadd_epi32( iSum, iSum );
788
789
0
  uint32_t sad = _mm_cvtsi128_si32( iSum );
790
0
  uint32_t absDc = _mm_cvtsi128_si32( m2[0][0] );
791
0
  sad -= absDc;
792
0
  sad += absDc >> 2;
793
0
  sad = ( ( sad + 2 ) >> 2 );
794
795
0
  return sad;
796
0
}
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD8x8_SSE(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD8x8_SSE(short const*, short const*, int, int, int)
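
A note on the 16-bit/32-bit split in this function, following its own bit-width annotations: 10-bit samples give differences that fit in 11 bits, and each of the three butterfly stages of the first pass can grow the magnitude by one bit (11 -> 12 -> 13 -> 14 bits), which still fits a signed 16-bit lane. The three stages of the second pass could add up to three more bits (14 + 3 = 17), which would overflow 16-bit lanes, so the data is widened with _mm_cvtepi16_epi32 before that pass; this is also why the CHECK above limits the bit depth to 10.
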
797
798
799
//working up to 12-bit
800
static uint32_t xCalcHAD16x16_fast_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth )
801
0
{
802
0
  __m128i m1[2][8], m2[2][8];
803
804
0
  CHECK( iBitDepth > 10, "Only bit-depths of up to 10 bits supported!" );
805
806
0
  for( int k = 0; k < 8; k++ )
807
0
  {
808
0
    __m128i r0 = _mm_loadu_si128( ( __m128i* )piOrg );
809
0
    __m128i r1 = _mm_loadu_si128( ( __m128i* )piCur );
810
0
    __m128i r2 = _mm_loadu_si128( ( __m128i* )( piOrg + iStrideOrg ) );
811
0
    __m128i r3 = _mm_loadu_si128( ( __m128i* )( piCur + iStrideCur ) );
812
813
0
    r0 = _mm_add_epi16( r0, r2 );
814
0
    r1 = _mm_add_epi16( r1, r3 );
815
816
0
    r2 = _mm_loadu_si128( ( __m128i* )( piOrg + 8 ) );
817
0
    r3 = _mm_loadu_si128( ( __m128i* )( piCur + 8 ) );
818
0
    __m128i r4 = _mm_loadu_si128( ( __m128i* )( piOrg + iStrideOrg + 8 ) );
819
0
    __m128i r5 = _mm_loadu_si128( ( __m128i* )( piCur + iStrideCur + 8 ) );
820
821
0
    r2 = _mm_add_epi16( r2, r4 );
822
0
    r3 = _mm_add_epi16( r3, r5 );
823
824
0
    r0 = _mm_hadd_epi16( r0, r2 );
825
0
    r1 = _mm_hadd_epi16( r1, r3 );
826
827
0
    r0 = _mm_add_epi16( r0, _mm_set1_epi16( 2 ) );
828
0
    r1 = _mm_add_epi16( r1, _mm_set1_epi16( 2 ) );
829
0
    r0 = _mm_srai_epi16( r0, 2 );
830
0
    r1 = _mm_srai_epi16( r1, 2 );
831
832
0
    m2[0][k] = _mm_sub_epi16( r0, r1 ); // 11bit
833
    //m2[1][k] = _mm_cvtepi16_epi32( _mm_srli_si128( m2[0][k], 8 ) );
834
    //m2[0][k] = _mm_cvtepi16_epi32( m2[0][k] );
835
0
    piCur += iStrideCur * 2;
836
0
    piOrg += iStrideOrg * 2;
837
0
  }
838
839
  //horizontal
840
0
  m1[0][0] = _mm_add_epi16( m2[0][0], m2[0][4] );
841
0
  m1[0][1] = _mm_add_epi16( m2[0][1], m2[0][5] );
842
0
  m1[0][2] = _mm_add_epi16( m2[0][2], m2[0][6] );
843
0
  m1[0][3] = _mm_add_epi16( m2[0][3], m2[0][7] );
844
0
  m1[0][4] = _mm_sub_epi16( m2[0][0], m2[0][4] );
845
0
  m1[0][5] = _mm_sub_epi16( m2[0][1], m2[0][5] );
846
0
  m1[0][6] = _mm_sub_epi16( m2[0][2], m2[0][6] );
847
0
  m1[0][7] = _mm_sub_epi16( m2[0][3], m2[0][7] ); // 12 bit
848
849
0
  m2[0][0] = _mm_add_epi16( m1[0][0], m1[0][2] );
850
0
  m2[0][1] = _mm_add_epi16( m1[0][1], m1[0][3] );
851
0
  m2[0][2] = _mm_sub_epi16( m1[0][0], m1[0][2] );
852
0
  m2[0][3] = _mm_sub_epi16( m1[0][1], m1[0][3] );
853
0
  m2[0][4] = _mm_add_epi16( m1[0][4], m1[0][6] );
854
0
  m2[0][5] = _mm_add_epi16( m1[0][5], m1[0][7] );
855
0
  m2[0][6] = _mm_sub_epi16( m1[0][4], m1[0][6] );
856
0
  m2[0][7] = _mm_sub_epi16( m1[0][5], m1[0][7] ); // 13 bit
857
858
0
  m1[0][0] = _mm_add_epi16( m2[0][0], m2[0][1] );
859
0
  m1[0][1] = _mm_sub_epi16( m2[0][0], m2[0][1] );
860
0
  m1[0][2] = _mm_add_epi16( m2[0][2], m2[0][3] );
861
0
  m1[0][3] = _mm_sub_epi16( m2[0][2], m2[0][3] );
862
0
  m1[0][4] = _mm_add_epi16( m2[0][4], m2[0][5] );
863
0
  m1[0][5] = _mm_sub_epi16( m2[0][4], m2[0][5] );
864
0
  m1[0][6] = _mm_add_epi16( m2[0][6], m2[0][7] );
865
0
  m1[0][7] = _mm_sub_epi16( m2[0][6], m2[0][7] ); // 14 bit
866
867
0
  m2[0][0] = _mm_unpacklo_epi16( m1[0][0], m1[0][1] );
868
0
  m2[0][1] = _mm_unpacklo_epi16( m1[0][2], m1[0][3] );
869
0
  m2[0][2] = _mm_unpackhi_epi16( m1[0][0], m1[0][1] );
870
0
  m2[0][3] = _mm_unpackhi_epi16( m1[0][2], m1[0][3] );
871
0
  m2[0][4] = _mm_unpacklo_epi16( m1[0][4], m1[0][5] );
872
0
  m2[0][5] = _mm_unpacklo_epi16( m1[0][6], m1[0][7] );
873
0
  m2[0][6] = _mm_unpackhi_epi16( m1[0][4], m1[0][5] );
874
0
  m2[0][7] = _mm_unpackhi_epi16( m1[0][6], m1[0][7] );
875
876
0
  m1[0][0] = _mm_unpacklo_epi32( m2[0][0], m2[0][1] );
877
0
  m1[0][1] = _mm_unpackhi_epi32( m2[0][0], m2[0][1] );
878
0
  m1[0][2] = _mm_unpacklo_epi32( m2[0][2], m2[0][3] );
879
0
  m1[0][3] = _mm_unpackhi_epi32( m2[0][2], m2[0][3] );
880
0
  m1[0][4] = _mm_unpacklo_epi32( m2[0][4], m2[0][5] );
881
0
  m1[0][5] = _mm_unpackhi_epi32( m2[0][4], m2[0][5] );
882
0
  m1[0][6] = _mm_unpacklo_epi32( m2[0][6], m2[0][7] );
883
0
  m1[0][7] = _mm_unpackhi_epi32( m2[0][6], m2[0][7] );
884
  
885
0
  m1[1][0] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][0], 8 ) );
886
0
  m1[0][0] = _mm_cvtepi16_epi32(                 m1[0][0]      );
887
0
  m1[1][1] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][1], 8 ) );
888
0
  m1[0][1] = _mm_cvtepi16_epi32(                 m1[0][1]      );
889
0
  m1[1][2] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][2], 8 ) );
890
0
  m1[0][2] = _mm_cvtepi16_epi32(                 m1[0][2]      );
891
0
  m1[1][3] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][3], 8 ) );
892
0
  m1[0][3] = _mm_cvtepi16_epi32(                 m1[0][3]      );
893
0
  m1[1][4] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][4], 8 ) );
894
0
  m1[0][4] = _mm_cvtepi16_epi32(                 m1[0][4]      );
895
0
  m1[1][5] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][5], 8 ) );
896
0
  m1[0][5] = _mm_cvtepi16_epi32(                 m1[0][5]      );
897
0
  m1[1][6] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][6], 8 ) );
898
0
  m1[0][6] = _mm_cvtepi16_epi32(                 m1[0][6]      );
899
0
  m1[1][7] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][7], 8 ) );
900
0
  m1[0][7] = _mm_cvtepi16_epi32(                 m1[0][7]      );
901
902
0
  for( int i = 0; i < 8; i++ )
903
0
  {
904
0
    int ii = i % 4;
905
0
    int ij = i >> 2;
906
907
0
    m2[0][i] = m1[ij][ii    ];
908
0
    m2[1][i] = m1[ij][ii + 4];
909
0
  }
910
911
0
  for( int i = 0; i < 2; i++ )
912
0
  {
913
0
    m1[i][0] = _mm_add_epi32( m2[i][0], m2[i][4] );
914
0
    m1[i][1] = _mm_add_epi32( m2[i][1], m2[i][5] );
915
0
    m1[i][2] = _mm_add_epi32( m2[i][2], m2[i][6] );
916
0
    m1[i][3] = _mm_add_epi32( m2[i][3], m2[i][7] );
917
0
    m1[i][4] = _mm_sub_epi32( m2[i][0], m2[i][4] );
918
0
    m1[i][5] = _mm_sub_epi32( m2[i][1], m2[i][5] );
919
0
    m1[i][6] = _mm_sub_epi32( m2[i][2], m2[i][6] );
920
0
    m1[i][7] = _mm_sub_epi32( m2[i][3], m2[i][7] );
921
922
0
    m2[i][0] = _mm_add_epi32( m1[i][0], m1[i][2] );
923
0
    m2[i][1] = _mm_add_epi32( m1[i][1], m1[i][3] );
924
0
    m2[i][2] = _mm_sub_epi32( m1[i][0], m1[i][2] );
925
0
    m2[i][3] = _mm_sub_epi32( m1[i][1], m1[i][3] );
926
0
    m2[i][4] = _mm_add_epi32( m1[i][4], m1[i][6] );
927
0
    m2[i][5] = _mm_add_epi32( m1[i][5], m1[i][7] );
928
0
    m2[i][6] = _mm_sub_epi32( m1[i][4], m1[i][6] );
929
0
    m2[i][7] = _mm_sub_epi32( m1[i][5], m1[i][7] );
930
931
0
    m1[i][0] = _mm_abs_epi32( _mm_add_epi32( m2[i][0], m2[i][1] ) );
932
0
    m1[i][1] = _mm_abs_epi32( _mm_sub_epi32( m2[i][0], m2[i][1] ) );
933
0
    m1[i][2] = _mm_abs_epi32( _mm_add_epi32( m2[i][2], m2[i][3] ) );
934
0
    m1[i][3] = _mm_abs_epi32( _mm_sub_epi32( m2[i][2], m2[i][3] ) );
935
0
    m1[i][4] = _mm_abs_epi32( _mm_add_epi32( m2[i][4], m2[i][5] ) );
936
0
    m1[i][5] = _mm_abs_epi32( _mm_sub_epi32( m2[i][4], m2[i][5] ) );
937
0
    m1[i][6] = _mm_abs_epi32( _mm_add_epi32( m2[i][6], m2[i][7] ) );
938
0
    m1[i][7] = _mm_abs_epi32( _mm_sub_epi32( m2[i][6], m2[i][7] ) );
939
0
  }
940
0
  m2[0][0] = m1[0][0];
941
0
  for( int i = 0; i < 8; i++ )
942
0
  {
943
0
    m1[0][i] = _mm_add_epi32( m1[0][i], m1[1][i] );
944
0
  }
945
946
0
  m1[0][0] = _mm_add_epi32( m1[0][0], m1[0][1] );
947
0
  m1[0][2] = _mm_add_epi32( m1[0][2], m1[0][3] );
948
0
  m1[0][4] = _mm_add_epi32( m1[0][4], m1[0][5] );
949
0
  m1[0][6] = _mm_add_epi32( m1[0][6], m1[0][7] );
950
951
0
  m1[0][0] = _mm_add_epi32( m1[0][0], m1[0][2] );
952
0
  m1[0][4] = _mm_add_epi32( m1[0][4], m1[0][6] );
953
0
  __m128i iSum = _mm_add_epi32( m1[0][0], m1[0][4] );
954
955
0
  iSum = _mm_hadd_epi32( iSum, iSum );
956
0
  iSum = _mm_hadd_epi32( iSum, iSum );
957
958
0
  uint32_t sad = _mm_cvtsi128_si32( iSum );
959
0
  uint32_t absDc = _mm_cvtsi128_si32( m2[0][0] );
960
0
  sad -= absDc;
961
0
  sad += absDc >> 2;
962
0
  sad = ( ( sad + 2 ) >> 2 );
963
964
0
  return ( sad << 2 );
965
0
}
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD16x16_fast_SSE(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD16x16_fast_SSE(short const*, short const*, int, int, int)
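
The "fast" 16x16 variant above does not transform the full block: org and cur are each first reduced to the rounded average of every 2x2 neighbourhood, the regular 8x8 Hadamard pipeline then runs on the difference of the averaged blocks, and the final << 2 scales the score back up. A hypothetical scalar form of the per-sample reduction (equivalent to the _mm_add_epi16 / _mm_hadd_epi16 / +2 / >>2 sequence above):

// Hypothetical helper: rounded average of one 2x2 neighbourhood.
static inline int avg2x2( const int16_t* p, int stride, int x )
{
  return ( p[x] + p[x + 1] + p[x + stride] + p[x + stride + 1] + 2 ) >> 2;
}
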
966
967
968
//working up to 12-bit
969
static uint32_t xCalcHAD16x8_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth )
970
0
{
971
0
  __m128i m1[16][2][2], m2[16][2][2];
972
0
  __m128i iSum = _mm_setzero_si128();
973
974
0
  for( int l = 0; l < 2; l++ )
975
0
  {
976
0
    const Torg *piOrgPtr = piOrg + l*8;
977
0
    const Tcur *piCurPtr = piCur + l*8;
978
0
    for( int k = 0; k < 8; k++ )
979
0
    {
980
0
      __m128i r0 = _mm_loadu_si128( (__m128i*) piOrgPtr );
981
0
      __m128i r1 = _mm_loadu_si128( (__m128i*) piCurPtr );
982
0
      m2[k][l][0] = _mm_sub_epi16( r0, r1 );
983
0
      m2[k][l][1] = _mm_cvtepi16_epi32( _mm_srli_si128( m2[k][l][0], 8 ) );
984
0
      m2[k][l][0] = _mm_cvtepi16_epi32( m2[k][l][0] );
985
0
      piCurPtr += iStrideCur;
986
0
      piOrgPtr += iStrideOrg;
987
0
    }
988
989
0
    for( int i = 0; i < 2; i++ )
990
0
    {
991
      //vertical
992
0
      m1[0][l][i] = _mm_add_epi32( m2[0][l][i], m2[4][l][i] );
993
0
      m1[1][l][i] = _mm_add_epi32( m2[1][l][i], m2[5][l][i] );
994
0
      m1[2][l][i] = _mm_add_epi32( m2[2][l][i], m2[6][l][i] );
995
0
      m1[3][l][i] = _mm_add_epi32( m2[3][l][i], m2[7][l][i] );
996
0
      m1[4][l][i] = _mm_sub_epi32( m2[0][l][i], m2[4][l][i] );
997
0
      m1[5][l][i] = _mm_sub_epi32( m2[1][l][i], m2[5][l][i] );
998
0
      m1[6][l][i] = _mm_sub_epi32( m2[2][l][i], m2[6][l][i] );
999
0
      m1[7][l][i] = _mm_sub_epi32( m2[3][l][i], m2[7][l][i] );
1000
1001
0
      m2[0][l][i] = _mm_add_epi32( m1[0][l][i], m1[2][l][i] );
1002
0
      m2[1][l][i] = _mm_add_epi32( m1[1][l][i], m1[3][l][i] );
1003
0
      m2[2][l][i] = _mm_sub_epi32( m1[0][l][i], m1[2][l][i] );
1004
0
      m2[3][l][i] = _mm_sub_epi32( m1[1][l][i], m1[3][l][i] );
1005
0
      m2[4][l][i] = _mm_add_epi32( m1[4][l][i], m1[6][l][i] );
1006
0
      m2[5][l][i] = _mm_add_epi32( m1[5][l][i], m1[7][l][i] );
1007
0
      m2[6][l][i] = _mm_sub_epi32( m1[4][l][i], m1[6][l][i] );
1008
0
      m2[7][l][i] = _mm_sub_epi32( m1[5][l][i], m1[7][l][i] );
1009
1010
0
      m1[0][l][i] = _mm_add_epi32( m2[0][l][i], m2[1][l][i] );
1011
0
      m1[1][l][i] = _mm_sub_epi32( m2[0][l][i], m2[1][l][i] );
1012
0
      m1[2][l][i] = _mm_add_epi32( m2[2][l][i], m2[3][l][i] );
1013
0
      m1[3][l][i] = _mm_sub_epi32( m2[2][l][i], m2[3][l][i] );
1014
0
      m1[4][l][i] = _mm_add_epi32( m2[4][l][i], m2[5][l][i] );
1015
0
      m1[5][l][i] = _mm_sub_epi32( m2[4][l][i], m2[5][l][i] );
1016
0
      m1[6][l][i] = _mm_add_epi32( m2[6][l][i], m2[7][l][i] );
1017
0
      m1[7][l][i] = _mm_sub_epi32( m2[6][l][i], m2[7][l][i] );
1018
0
    }
1019
0
  }
1020
1021
  // 4 x 8x4 blocks
1022
  // 0 1
1023
  // 2 3
1024
0
  uint32_t absDc = 0;
1025
1026
  // transpose and do horizontal in two steps
1027
0
  for( int l = 0; l < 2; l++ )
1028
0
  {
1029
0
    int off = l * 4;
1030
1031
0
    __m128i n1[16];
1032
0
    __m128i n2[16];
1033
1034
0
    m2[0][0][0] = _mm_unpacklo_epi32( m1[0 + off][0][0], m1[1 + off][0][0] );
1035
0
    m2[1][0][0] = _mm_unpacklo_epi32( m1[2 + off][0][0], m1[3 + off][0][0] );
1036
0
    m2[2][0][0] = _mm_unpackhi_epi32( m1[0 + off][0][0], m1[1 + off][0][0] );
1037
0
    m2[3][0][0] = _mm_unpackhi_epi32( m1[2 + off][0][0], m1[3 + off][0][0] );
1038
1039
0
    m2[0][0][1] = _mm_unpacklo_epi32( m1[0 + off][0][1], m1[1 + off][0][1] );
1040
0
    m2[1][0][1] = _mm_unpacklo_epi32( m1[2 + off][0][1], m1[3 + off][0][1] );
1041
0
    m2[2][0][1] = _mm_unpackhi_epi32( m1[0 + off][0][1], m1[1 + off][0][1] );
1042
0
    m2[3][0][1] = _mm_unpackhi_epi32( m1[2 + off][0][1], m1[3 + off][0][1] );
1043
1044
0
    n1[0]       = _mm_unpacklo_epi64( m2[0][0][0], m2[1][0][0] );
1045
0
    n1[1]       = _mm_unpackhi_epi64( m2[0][0][0], m2[1][0][0] );
1046
0
    n1[2]       = _mm_unpacklo_epi64( m2[2][0][0], m2[3][0][0] );
1047
0
    n1[3]       = _mm_unpackhi_epi64( m2[2][0][0], m2[3][0][0] );
1048
0
    n1[4]       = _mm_unpacklo_epi64( m2[0][0][1], m2[1][0][1] );
1049
0
    n1[5]       = _mm_unpackhi_epi64( m2[0][0][1], m2[1][0][1] );
1050
0
    n1[6]       = _mm_unpacklo_epi64( m2[2][0][1], m2[3][0][1] );
1051
0
    n1[7]       = _mm_unpackhi_epi64( m2[2][0][1], m2[3][0][1] );
1052
1053
    // transpose 8x4 -> 4x8, block 1(3)
1054
0
    m2[8+0][0][0] = _mm_unpacklo_epi32( m1[0 + off][1][0], m1[1 + off][1][0] );
1055
0
    m2[8+1][0][0] = _mm_unpacklo_epi32( m1[2 + off][1][0], m1[3 + off][1][0] );
1056
0
    m2[8+2][0][0] = _mm_unpackhi_epi32( m1[0 + off][1][0], m1[1 + off][1][0] );
1057
0
    m2[8+3][0][0] = _mm_unpackhi_epi32( m1[2 + off][1][0], m1[3 + off][1][0] );
1058
1059
0
    m2[8+0][0][1] = _mm_unpacklo_epi32( m1[0 + off][1][1], m1[1 + off][1][1] );
1060
0
    m2[8+1][0][1] = _mm_unpacklo_epi32( m1[2 + off][1][1], m1[3 + off][1][1] );
1061
0
    m2[8+2][0][1] = _mm_unpackhi_epi32( m1[0 + off][1][1], m1[1 + off][1][1] );
1062
0
    m2[8+3][0][1] = _mm_unpackhi_epi32( m1[2 + off][1][1], m1[3 + off][1][1] );
1063
1064
0
    n1[8+0]       = _mm_unpacklo_epi64( m2[8+0][0][0], m2[8+1][0][0] );
1065
0
    n1[8+1]       = _mm_unpackhi_epi64( m2[8+0][0][0], m2[8+1][0][0] );
1066
0
    n1[8+2]       = _mm_unpacklo_epi64( m2[8+2][0][0], m2[8+3][0][0] );
1067
0
    n1[8+3]       = _mm_unpackhi_epi64( m2[8+2][0][0], m2[8+3][0][0] );
1068
0
    n1[8+4]       = _mm_unpacklo_epi64( m2[8+0][0][1], m2[8+1][0][1] );
1069
0
    n1[8+5]       = _mm_unpackhi_epi64( m2[8+0][0][1], m2[8+1][0][1] );
1070
0
    n1[8+6]       = _mm_unpacklo_epi64( m2[8+2][0][1], m2[8+3][0][1] );
1071
0
    n1[8+7]       = _mm_unpackhi_epi64( m2[8+2][0][1], m2[8+3][0][1] );
1072
1073
0
    n2[0] = _mm_add_epi32( n1[0], n1[8] );
1074
0
    n2[1] = _mm_add_epi32( n1[1], n1[9] );
1075
0
    n2[2] = _mm_add_epi32( n1[2], n1[10] );
1076
0
    n2[3] = _mm_add_epi32( n1[3], n1[11] );
1077
0
    n2[4] = _mm_add_epi32( n1[4], n1[12] );
1078
0
    n2[5] = _mm_add_epi32( n1[5], n1[13] );
1079
0
    n2[6] = _mm_add_epi32( n1[6], n1[14] );
1080
0
    n2[7] = _mm_add_epi32( n1[7], n1[15] );
1081
0
    n2[8] = _mm_sub_epi32( n1[0], n1[8] );
1082
0
    n2[9] = _mm_sub_epi32( n1[1], n1[9] );
1083
0
    n2[10] = _mm_sub_epi32( n1[2], n1[10] );
1084
0
    n2[11] = _mm_sub_epi32( n1[3], n1[11] );
1085
0
    n2[12] = _mm_sub_epi32( n1[4], n1[12] );
1086
0
    n2[13] = _mm_sub_epi32( n1[5], n1[13] );
1087
0
    n2[14] = _mm_sub_epi32( n1[6], n1[14] );
1088
0
    n2[15] = _mm_sub_epi32( n1[7], n1[15] );
1089
1090
0
    n1[0] = _mm_add_epi32( n2[0], n2[4] );
1091
0
    n1[1] = _mm_add_epi32( n2[1], n2[5] );
1092
0
    n1[2] = _mm_add_epi32( n2[2], n2[6] );
1093
0
    n1[3] = _mm_add_epi32( n2[3], n2[7] );
1094
0
    n1[4] = _mm_sub_epi32( n2[0], n2[4] );
1095
0
    n1[5] = _mm_sub_epi32( n2[1], n2[5] );
1096
0
    n1[6] = _mm_sub_epi32( n2[2], n2[6] );
1097
0
    n1[7] = _mm_sub_epi32( n2[3], n2[7] );
1098
0
    n1[8] = _mm_add_epi32( n2[8], n2[12] );
1099
0
    n1[9] = _mm_add_epi32( n2[9], n2[13] );
1100
0
    n1[10] = _mm_add_epi32( n2[10], n2[14] );
1101
0
    n1[11] = _mm_add_epi32( n2[11], n2[15] );
1102
0
    n1[12] = _mm_sub_epi32( n2[8], n2[12] );
1103
0
    n1[13] = _mm_sub_epi32( n2[9], n2[13] );
1104
0
    n1[14] = _mm_sub_epi32( n2[10], n2[14] );
1105
0
    n1[15] = _mm_sub_epi32( n2[11], n2[15] );
1106
1107
0
    n2[0] = _mm_add_epi32( n1[0], n1[2] );
1108
0
    n2[1] = _mm_add_epi32( n1[1], n1[3] );
1109
0
    n2[2] = _mm_sub_epi32( n1[0], n1[2] );
1110
0
    n2[3] = _mm_sub_epi32( n1[1], n1[3] );
1111
0
    n2[4] = _mm_add_epi32( n1[4], n1[6] );
1112
0
    n2[5] = _mm_add_epi32( n1[5], n1[7] );
1113
0
    n2[6] = _mm_sub_epi32( n1[4], n1[6] );
1114
0
    n2[7] = _mm_sub_epi32( n1[5], n1[7] );
1115
0
    n2[8] = _mm_add_epi32( n1[8], n1[10] );
1116
0
    n2[9] = _mm_add_epi32( n1[9], n1[11] );
1117
0
    n2[10] = _mm_sub_epi32( n1[8], n1[10] );
1118
0
    n2[11] = _mm_sub_epi32( n1[9], n1[11] );
1119
0
    n2[12] = _mm_add_epi32( n1[12], n1[14] );
1120
0
    n2[13] = _mm_add_epi32( n1[13], n1[15] );
1121
0
    n2[14] = _mm_sub_epi32( n1[12], n1[14] );
1122
0
    n2[15] = _mm_sub_epi32( n1[13], n1[15] );
1123
1124
0
    n1[0] = _mm_abs_epi32( _mm_add_epi32( n2[0], n2[1] ) );
1125
0
    n1[1] = _mm_abs_epi32( _mm_sub_epi32( n2[0], n2[1] ) );
1126
0
    n1[2] = _mm_abs_epi32( _mm_add_epi32( n2[2], n2[3] ) );
1127
0
    n1[3] = _mm_abs_epi32( _mm_sub_epi32( n2[2], n2[3] ) );
1128
0
    n1[4] = _mm_abs_epi32( _mm_add_epi32( n2[4], n2[5] ) );
1129
0
    n1[5] = _mm_abs_epi32( _mm_sub_epi32( n2[4], n2[5] ) );
1130
0
    n1[6] = _mm_abs_epi32( _mm_add_epi32( n2[6], n2[7] ) );
1131
0
    n1[7] = _mm_abs_epi32( _mm_sub_epi32( n2[6], n2[7] ) );
1132
0
    n1[8] = _mm_abs_epi32( _mm_add_epi32( n2[8], n2[9] ) );
1133
0
    n1[9] = _mm_abs_epi32( _mm_sub_epi32( n2[8], n2[9] ) );
1134
0
    n1[10] = _mm_abs_epi32( _mm_add_epi32( n2[10], n2[11] ) );
1135
0
    n1[11] = _mm_abs_epi32( _mm_sub_epi32( n2[10], n2[11] ) );
1136
0
    n1[12] = _mm_abs_epi32( _mm_add_epi32( n2[12], n2[13] ) );
1137
0
    n1[13] = _mm_abs_epi32( _mm_sub_epi32( n2[12], n2[13] ) );
1138
0
    n1[14] = _mm_abs_epi32( _mm_add_epi32( n2[14], n2[15] ) );
1139
0
    n1[15] = _mm_abs_epi32( _mm_sub_epi32( n2[14], n2[15] ) );
1140
    
1141
0
    if (l == 0)
1142
0
      absDc = _mm_cvtsi128_si32( n1[0] );
1143
1144
    // sum up
1145
0
    n1[0] = _mm_add_epi32( n1[0], n1[1] );
1146
0
    n1[2] = _mm_add_epi32( n1[2], n1[3] );
1147
0
    n1[4] = _mm_add_epi32( n1[4], n1[5] );
1148
0
    n1[6] = _mm_add_epi32( n1[6], n1[7] );
1149
0
    n1[8] = _mm_add_epi32( n1[8], n1[9] );
1150
0
    n1[10] = _mm_add_epi32( n1[10], n1[11] );
1151
0
    n1[12] = _mm_add_epi32( n1[12], n1[13] );
1152
0
    n1[14] = _mm_add_epi32( n1[14], n1[15] );
1153
1154
0
    n1[0] = _mm_add_epi32( n1[0], n1[2] );
1155
0
    n1[4] = _mm_add_epi32( n1[4], n1[6] );
1156
0
    n1[8] = _mm_add_epi32( n1[8], n1[10] );
1157
0
    n1[12] = _mm_add_epi32( n1[12], n1[14] );
1158
1159
0
    n1[0] = _mm_add_epi32( n1[0], n1[4] );
1160
0
    n1[8] = _mm_add_epi32( n1[8], n1[12] );
1161
1162
0
    n1[0] = _mm_add_epi32( n1[0], n1[8] );
1163
0
    iSum = _mm_add_epi32( iSum, n1[0] );
1164
0
  }
1165
1166
0
  iSum = _mm_hadd_epi32( iSum, iSum );
1167
0
  iSum = _mm_hadd_epi32( iSum, iSum );
1168
1169
0
  uint32_t sad = _mm_cvtsi128_si32( iSum );
1170
0
  sad -= absDc;
1171
0
  sad += absDc >> 2;
1172
0
  sad = (uint32_t)(sad / sqrt(16.0 * 8) * 2);
1173
1174
0
  return sad;
1175
0
}
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD16x8_SSE(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD16x8_SSE(short const*, short const*, int, int, int)
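
Editorial note: xCalcHAD16x8_SSE above, and several of the block-size kernels below, end with the same tail: the absolute DC coefficient is re-weighted to a quarter of its magnitude and the coefficient sum is scaled by 2/sqrt(width*height) to normalize the transform gain. A minimal scalar sketch of that tail, for illustration only (normalizeHadCost is a hypothetical helper, not part of this file; it relies on the math.h include already present at the top of the file):

static inline uint32_t normalizeHadCost( uint32_t hadSum, uint32_t absDc, int width, int height )
{
  hadSum -= absDc;        // drop the full DC contribution ...
  hadSum += absDc >> 2;   // ... and add it back with weight 1/4
  return (uint32_t)( hadSum / sqrt( (double)width * height ) * 2 );  // 2/sqrt(w*h) gain normalization
}
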
1176
1177
1178
// works for bit depths up to 12 bit
1179
static uint32_t xCalcHAD8x16_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth )
1180
0
{
1181
0
  __m128i m1[2][16], m2[2][16];
1182
0
  __m128i iSum = _mm_setzero_si128();
1183
1184
0
  for( int k = 0; k < 16; k++ )
1185
0
  {
1186
0
    __m128i r0 =_mm_loadu_si128( (__m128i*)piOrg );
1187
0
    __m128i r1 =_mm_loadu_si128( (__m128i*)piCur );
1188
0
    m1[0][k] = _mm_sub_epi16( r0, r1 );
1189
0
    m1[1][k] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][k], 8 ) );
1190
0
    m1[0][k] = _mm_cvtepi16_epi32( m1[0][k] );
1191
0
    piCur += iStrideCur;
1192
0
    piOrg += iStrideOrg;
1193
0
  }
1194
1195
0
  for( int i = 0; i < 2; i++ )
1196
0
  {
1197
    // vertical
1198
0
    m2[i][ 0] = _mm_add_epi32( m1[i][ 0], m1[i][ 8] );
1199
0
    m2[i][ 1] = _mm_add_epi32( m1[i][ 1], m1[i][ 9] );
1200
0
    m2[i][ 2] = _mm_add_epi32( m1[i][ 2], m1[i][10] );
1201
0
    m2[i][ 3] = _mm_add_epi32( m1[i][ 3], m1[i][11] );
1202
0
    m2[i][ 4] = _mm_add_epi32( m1[i][ 4], m1[i][12] );
1203
0
    m2[i][ 5] = _mm_add_epi32( m1[i][ 5], m1[i][13] );
1204
0
    m2[i][ 6] = _mm_add_epi32( m1[i][ 6], m1[i][14] );
1205
0
    m2[i][ 7] = _mm_add_epi32( m1[i][ 7], m1[i][15] );
1206
0
    m2[i][ 8] = _mm_sub_epi32( m1[i][ 0], m1[i][ 8] );
1207
0
    m2[i][ 9] = _mm_sub_epi32( m1[i][ 1], m1[i][ 9] );
1208
0
    m2[i][10] = _mm_sub_epi32( m1[i][ 2], m1[i][10] );
1209
0
    m2[i][11] = _mm_sub_epi32( m1[i][ 3], m1[i][11] );
1210
0
    m2[i][12] = _mm_sub_epi32( m1[i][ 4], m1[i][12] );
1211
0
    m2[i][13] = _mm_sub_epi32( m1[i][ 5], m1[i][13] );
1212
0
    m2[i][14] = _mm_sub_epi32( m1[i][ 6], m1[i][14] );
1213
0
    m2[i][15] = _mm_sub_epi32( m1[i][ 7], m1[i][15] );
1214
1215
0
    m1[i][ 0] = _mm_add_epi32( m2[i][ 0], m2[i][ 4] );
1216
0
    m1[i][ 1] = _mm_add_epi32( m2[i][ 1], m2[i][ 5] );
1217
0
    m1[i][ 2] = _mm_add_epi32( m2[i][ 2], m2[i][ 6] );
1218
0
    m1[i][ 3] = _mm_add_epi32( m2[i][ 3], m2[i][ 7] );
1219
0
    m1[i][ 4] = _mm_sub_epi32( m2[i][ 0], m2[i][ 4] );
1220
0
    m1[i][ 5] = _mm_sub_epi32( m2[i][ 1], m2[i][ 5] );
1221
0
    m1[i][ 6] = _mm_sub_epi32( m2[i][ 2], m2[i][ 6] );
1222
0
    m1[i][ 7] = _mm_sub_epi32( m2[i][ 3], m2[i][ 7] );
1223
0
    m1[i][ 8] = _mm_add_epi32( m2[i][ 8], m2[i][12] );
1224
0
    m1[i][ 9] = _mm_add_epi32( m2[i][ 9], m2[i][13] );
1225
0
    m1[i][10] = _mm_add_epi32( m2[i][10], m2[i][14] );
1226
0
    m1[i][11] = _mm_add_epi32( m2[i][11], m2[i][15] );
1227
0
    m1[i][12] = _mm_sub_epi32( m2[i][ 8], m2[i][12] );
1228
0
    m1[i][13] = _mm_sub_epi32( m2[i][ 9], m2[i][13] );
1229
0
    m1[i][14] = _mm_sub_epi32( m2[i][10], m2[i][14] );
1230
0
    m1[i][15] = _mm_sub_epi32( m2[i][11], m2[i][15] );
1231
1232
0
    m2[i][ 0] = _mm_add_epi32( m1[i][ 0], m1[i][ 2] );
1233
0
    m2[i][ 1] = _mm_add_epi32( m1[i][ 1], m1[i][ 3] );
1234
0
    m2[i][ 2] = _mm_sub_epi32( m1[i][ 0], m1[i][ 2] );
1235
0
    m2[i][ 3] = _mm_sub_epi32( m1[i][ 1], m1[i][ 3] );
1236
0
    m2[i][ 4] = _mm_add_epi32( m1[i][ 4], m1[i][ 6] );
1237
0
    m2[i][ 5] = _mm_add_epi32( m1[i][ 5], m1[i][ 7] );
1238
0
    m2[i][ 6] = _mm_sub_epi32( m1[i][ 4], m1[i][ 6] );
1239
0
    m2[i][ 7] = _mm_sub_epi32( m1[i][ 5], m1[i][ 7] );
1240
0
    m2[i][ 8] = _mm_add_epi32( m1[i][ 8], m1[i][10] );
1241
0
    m2[i][ 9] = _mm_add_epi32( m1[i][ 9], m1[i][11] );
1242
0
    m2[i][10] = _mm_sub_epi32( m1[i][ 8], m1[i][10] );
1243
0
    m2[i][11] = _mm_sub_epi32( m1[i][ 9], m1[i][11] );
1244
0
    m2[i][12] = _mm_add_epi32( m1[i][12], m1[i][14] );
1245
0
    m2[i][13] = _mm_add_epi32( m1[i][13], m1[i][15] );
1246
0
    m2[i][14] = _mm_sub_epi32( m1[i][12], m1[i][14] );
1247
0
    m2[i][15] = _mm_sub_epi32( m1[i][13], m1[i][15] );
1248
1249
0
    m1[i][ 0] = _mm_add_epi32( m2[i][ 0], m2[i][ 1] );
1250
0
    m1[i][ 1] = _mm_sub_epi32( m2[i][ 0], m2[i][ 1] );
1251
0
    m1[i][ 2] = _mm_add_epi32( m2[i][ 2], m2[i][ 3] );
1252
0
    m1[i][ 3] = _mm_sub_epi32( m2[i][ 2], m2[i][ 3] );
1253
0
    m1[i][ 4] = _mm_add_epi32( m2[i][ 4], m2[i][ 5] );
1254
0
    m1[i][ 5] = _mm_sub_epi32( m2[i][ 4], m2[i][ 5] );
1255
0
    m1[i][ 6] = _mm_add_epi32( m2[i][ 6], m2[i][ 7] );
1256
0
    m1[i][ 7] = _mm_sub_epi32( m2[i][ 6], m2[i][ 7] );
1257
0
    m1[i][ 8] = _mm_add_epi32( m2[i][ 8], m2[i][ 9] );
1258
0
    m1[i][ 9] = _mm_sub_epi32( m2[i][ 8], m2[i][ 9] );
1259
0
    m1[i][10] = _mm_add_epi32( m2[i][10], m2[i][11] );
1260
0
    m1[i][11] = _mm_sub_epi32( m2[i][10], m2[i][11] );
1261
0
    m1[i][12] = _mm_add_epi32( m2[i][12], m2[i][13] );
1262
0
    m1[i][13] = _mm_sub_epi32( m2[i][12], m2[i][13] );
1263
0
    m1[i][14] = _mm_add_epi32( m2[i][14], m2[i][15] );
1264
0
    m1[i][15] = _mm_sub_epi32( m2[i][14], m2[i][15] );
1265
0
  }
1266
1267
  // process horizontal in two steps ( 2 x 8x8 blocks )
1268
1269
0
  for( int l = 0; l < 4; l++ )
1270
0
  {
1271
0
    int off = l * 4;
1272
1273
0
    for( int i = 0; i < 2; i++ )
1274
0
    {
1275
      // transpose 4x4
1276
0
      m2[i][0 + off] = _mm_unpacklo_epi32( m1[i][0 + off], m1[i][1 + off] );
1277
0
      m2[i][1 + off] = _mm_unpackhi_epi32( m1[i][0 + off], m1[i][1 + off] );
1278
0
      m2[i][2 + off] = _mm_unpacklo_epi32( m1[i][2 + off], m1[i][3 + off] );
1279
0
      m2[i][3 + off] = _mm_unpackhi_epi32( m1[i][2 + off], m1[i][3 + off] );
1280
1281
0
      m1[i][0 + off] = _mm_unpacklo_epi64( m2[i][0 + off], m2[i][2 + off] );
1282
0
      m1[i][1 + off] = _mm_unpackhi_epi64( m2[i][0 + off], m2[i][2 + off] );
1283
0
      m1[i][2 + off] = _mm_unpacklo_epi64( m2[i][1 + off], m2[i][3 + off] );
1284
0
      m1[i][3 + off] = _mm_unpackhi_epi64( m2[i][1 + off], m2[i][3 + off] );
1285
0
    }
1286
0
  }
1287
1288
0
  uint32_t absDc = 0;
1289
1290
0
  for( int l = 0; l < 2; l++ )
1291
0
  {
1292
0
    int off = l * 8;
1293
1294
0
    __m128i n1[2][8];
1295
0
    __m128i n2[2][8];
1296
1297
0
    for( int i = 0; i < 8; i++ )
1298
0
    {
1299
0
      int ii = i % 4;
1300
0
      int ij = i >> 2;
1301
1302
0
      n2[0][i] = m1[ij][off + ii    ];
1303
0
      n2[1][i] = m1[ij][off + ii + 4];
1304
0
    }
1305
1306
0
    for( int i = 0; i < 2; i++ )
1307
0
    {
1308
0
      n1[i][0] = _mm_add_epi32( n2[i][0], n2[i][4] );
1309
0
      n1[i][1] = _mm_add_epi32( n2[i][1], n2[i][5] );
1310
0
      n1[i][2] = _mm_add_epi32( n2[i][2], n2[i][6] );
1311
0
      n1[i][3] = _mm_add_epi32( n2[i][3], n2[i][7] );
1312
0
      n1[i][4] = _mm_sub_epi32( n2[i][0], n2[i][4] );
1313
0
      n1[i][5] = _mm_sub_epi32( n2[i][1], n2[i][5] );
1314
0
      n1[i][6] = _mm_sub_epi32( n2[i][2], n2[i][6] );
1315
0
      n1[i][7] = _mm_sub_epi32( n2[i][3], n2[i][7] );
1316
1317
0
      n2[i][0] = _mm_add_epi32( n1[i][0], n1[i][2] );
1318
0
      n2[i][1] = _mm_add_epi32( n1[i][1], n1[i][3] );
1319
0
      n2[i][2] = _mm_sub_epi32( n1[i][0], n1[i][2] );
1320
0
      n2[i][3] = _mm_sub_epi32( n1[i][1], n1[i][3] );
1321
0
      n2[i][4] = _mm_add_epi32( n1[i][4], n1[i][6] );
1322
0
      n2[i][5] = _mm_add_epi32( n1[i][5], n1[i][7] );
1323
0
      n2[i][6] = _mm_sub_epi32( n1[i][4], n1[i][6] );
1324
0
      n2[i][7] = _mm_sub_epi32( n1[i][5], n1[i][7] );
1325
1326
0
      n1[i][0] = _mm_abs_epi32( _mm_add_epi32( n2[i][0], n2[i][1] ) );
1327
0
      n1[i][1] = _mm_abs_epi32( _mm_sub_epi32( n2[i][0], n2[i][1] ) );
1328
0
      n1[i][2] = _mm_abs_epi32( _mm_add_epi32( n2[i][2], n2[i][3] ) );
1329
0
      n1[i][3] = _mm_abs_epi32( _mm_sub_epi32( n2[i][2], n2[i][3] ) );
1330
0
      n1[i][4] = _mm_abs_epi32( _mm_add_epi32( n2[i][4], n2[i][5] ) );
1331
0
      n1[i][5] = _mm_abs_epi32( _mm_sub_epi32( n2[i][4], n2[i][5] ) );
1332
0
      n1[i][6] = _mm_abs_epi32( _mm_add_epi32( n2[i][6], n2[i][7] ) );
1333
0
      n1[i][7] = _mm_abs_epi32( _mm_sub_epi32( n2[i][6], n2[i][7] ) );
1334
      
1335
0
      if ( l + i == 0 )
1336
0
        absDc = _mm_cvtsi128_si32( n1[i][0] );
1337
0
    }
1338
1339
0
    for( int i = 0; i < 8; i++ )
1340
0
    {
1341
0
      n2[0][i] = _mm_add_epi32( n1[0][i], n1[1][i] );
1342
0
    }
1343
1344
0
    n2[0][0] = _mm_add_epi32( n2[0][0], n2[0][1] );
1345
0
    n2[0][2] = _mm_add_epi32( n2[0][2], n2[0][3] );
1346
0
    n2[0][4] = _mm_add_epi32( n2[0][4], n2[0][5] );
1347
0
    n2[0][6] = _mm_add_epi32( n2[0][6], n2[0][7] );
1348
1349
0
    n2[0][0] = _mm_add_epi32( n2[0][0], n2[0][2] );
1350
0
    n2[0][4] = _mm_add_epi32( n2[0][4], n2[0][6] );
1351
0
    iSum = _mm_add_epi32( iSum, _mm_add_epi32( n2[0][0], n2[0][4] ) );
1352
0
  }
1353
1354
0
  iSum = _mm_hadd_epi32( iSum, iSum );
1355
0
  iSum = _mm_hadd_epi32( iSum, iSum );
1356
1357
0
  uint32_t sad = _mm_cvtsi128_si32( iSum );
1358
0
  sad -= absDc;
1359
0
  sad += absDc >> 2;
1360
0
  sad = (uint32_t)(sad / sqrt(16.0 * 8) * 2);
1361
1362
0
  return sad;
1363
0
}
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD8x16_SSE(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD8x16_SSE(short const*, short const*, int, int, int)
1364
1365
1366
template< typename Torg, typename Tcur >
1367
static uint32_t xCalcHAD8x4_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth )
1368
0
{
1369
0
  __m128i m1[8], m2[8];
1370
0
  __m128i vzero = _mm_setzero_si128();
1371
1372
0
  for( int k = 0; k < 4; k++ )
1373
0
  {
1374
0
    __m128i r0 = (sizeof( Torg ) > 1) ? (_mm_loadu_si128 ( (__m128i*)piOrg )) : (_mm_unpacklo_epi8( _vv_loadl_epi64( (const __m128i*)piOrg ), _mm_setzero_si128() ));
1375
0
    __m128i r1 = (sizeof( Tcur ) > 1) ? (_mm_loadu_si128( (__m128i*)piCur )) : (_mm_unpacklo_epi8( _vv_loadl_epi64( (const __m128i*)piCur ), _mm_setzero_si128() ));
1376
0
    m1[k] = _mm_sub_epi16( r0, r1 );
1377
0
    piCur += iStrideCur;
1378
0
    piOrg += iStrideOrg;
1379
0
  }
1380
1381
  //vertical
1382
0
  m2[0] = _mm_add_epi16( m1[0], m1[2] );
1383
0
  m2[1] = _mm_add_epi16( m1[1], m1[3] );
1384
0
  m2[2] = _mm_sub_epi16( m1[0], m1[2] );
1385
0
  m2[3] = _mm_sub_epi16( m1[1], m1[3] );
1386
1387
0
  m1[0] = _mm_add_epi16( m2[0], m2[1] );
1388
0
  m1[1] = _mm_sub_epi16( m2[0], m2[1] );
1389
0
  m1[2] = _mm_add_epi16( m2[2], m2[3] );
1390
0
  m1[3] = _mm_sub_epi16( m2[2], m2[3] );
1391
1392
  // transpose, partially
1393
0
  {
1394
0
    m2[0] = _mm_unpacklo_epi16( m1[0], m1[1] );
1395
0
    m2[1] = _mm_unpacklo_epi16( m1[2], m1[3] );
1396
0
    m2[2] = _mm_unpackhi_epi16( m1[0], m1[1] );
1397
0
    m2[3] = _mm_unpackhi_epi16( m1[2], m1[3] );
1398
1399
0
    m1[0] = _mm_unpacklo_epi32( m2[0], m2[1] );
1400
0
    m1[1] = _mm_unpackhi_epi32( m2[0], m2[1] );
1401
0
    m1[2] = _mm_unpacklo_epi32( m2[2], m2[3] );
1402
0
    m1[3] = _mm_unpackhi_epi32( m2[2], m2[3] );
1403
0
  }
1404
1405
  // horizontal
1406
0
  if( iBitDepth >= 10 /*sizeof( Torg ) > 1 || sizeof( Tcur ) > 1*/ )
1407
0
  {
1408
    // finish transpose
1409
0
    m2[0] = _mm_unpacklo_epi64( m1[0], vzero );
1410
0
    m2[1] = _mm_unpackhi_epi64( m1[0], vzero );
1411
0
    m2[2] = _mm_unpacklo_epi64( m1[1], vzero );
1412
0
    m2[3] = _mm_unpackhi_epi64( m1[1], vzero );
1413
0
    m2[4] = _mm_unpacklo_epi64( m1[2], vzero );
1414
0
    m2[5] = _mm_unpackhi_epi64( m1[2], vzero );
1415
0
    m2[6] = _mm_unpacklo_epi64( m1[3], vzero );
1416
0
    m2[7] = _mm_unpackhi_epi64( m1[3], vzero );
1417
1418
0
    for( int i = 0; i < 8; i++ )
1419
0
    {
1420
0
      m2[i] = _mm_cvtepi16_epi32( m2[i] );
1421
0
    }
1422
1423
0
    m1[0] = _mm_add_epi32( m2[0], m2[4] );
1424
0
    m1[1] = _mm_add_epi32( m2[1], m2[5] );
1425
0
    m1[2] = _mm_add_epi32( m2[2], m2[6] );
1426
0
    m1[3] = _mm_add_epi32( m2[3], m2[7] );
1427
0
    m1[4] = _mm_sub_epi32( m2[0], m2[4] );
1428
0
    m1[5] = _mm_sub_epi32( m2[1], m2[5] );
1429
0
    m1[6] = _mm_sub_epi32( m2[2], m2[6] );
1430
0
    m1[7] = _mm_sub_epi32( m2[3], m2[7] );
1431
1432
0
    m2[0] = _mm_add_epi32( m1[0], m1[2] );
1433
0
    m2[1] = _mm_add_epi32( m1[1], m1[3] );
1434
0
    m2[2] = _mm_sub_epi32( m1[0], m1[2] );
1435
0
    m2[3] = _mm_sub_epi32( m1[1], m1[3] );
1436
0
    m2[4] = _mm_add_epi32( m1[4], m1[6] );
1437
0
    m2[5] = _mm_add_epi32( m1[5], m1[7] );
1438
0
    m2[6] = _mm_sub_epi32( m1[4], m1[6] );
1439
0
    m2[7] = _mm_sub_epi32( m1[5], m1[7] );
1440
1441
0
    m1[0] = _mm_abs_epi32( _mm_add_epi32( m2[0], m2[1] ) );
1442
0
    m1[1] = _mm_abs_epi32( _mm_sub_epi32( m2[0], m2[1] ) );
1443
0
    m1[2] = _mm_abs_epi32( _mm_add_epi32( m2[2], m2[3] ) );
1444
0
    m1[3] = _mm_abs_epi32( _mm_sub_epi32( m2[2], m2[3] ) );
1445
0
    m1[4] = _mm_abs_epi32( _mm_add_epi32( m2[4], m2[5] ) );
1446
0
    m1[5] = _mm_abs_epi32( _mm_sub_epi32( m2[4], m2[5] ) );
1447
0
    m1[6] = _mm_abs_epi32( _mm_add_epi32( m2[6], m2[7] ) );
1448
0
    m1[7] = _mm_abs_epi32( _mm_sub_epi32( m2[6], m2[7] ) );
1449
0
  }
1450
0
  else
1451
0
  {
1452
0
    m2[0] = _mm_add_epi16( m1[0], m1[2] );
1453
0
    m2[1] = _mm_add_epi16( m1[1], m1[3] );
1454
0
    m2[2] = _mm_sub_epi16( m1[0], m1[2] );
1455
0
    m2[3] = _mm_sub_epi16( m1[1], m1[3] );
1456
1457
0
    m1[0] = _mm_add_epi16( m2[0], m2[1] );
1458
0
    m1[1] = _mm_sub_epi16( m2[0], m2[1] );
1459
0
    m1[2] = _mm_add_epi16( m2[2], m2[3] );
1460
0
    m1[3] = _mm_sub_epi16( m2[2], m2[3] );
1461
1462
    // finish transpose
1463
0
    m2[0] = _mm_unpacklo_epi64( m1[0], vzero );
1464
0
    m2[1] = _mm_unpackhi_epi64( m1[0], vzero );
1465
0
    m2[2] = _mm_unpacklo_epi64( m1[1], vzero );
1466
0
    m2[3] = _mm_unpackhi_epi64( m1[1], vzero );
1467
0
    m2[4] = _mm_unpacklo_epi64( m1[2], vzero );
1468
0
    m2[5] = _mm_unpackhi_epi64( m1[2], vzero );
1469
0
    m2[6] = _mm_unpacklo_epi64( m1[3], vzero );
1470
0
    m2[7] = _mm_unpackhi_epi64( m1[3], vzero );
1471
1472
0
    m1[0] = _mm_abs_epi16( _mm_add_epi16( m2[0], m2[1] ) );
1473
0
    m1[1] = _mm_abs_epi16( _mm_sub_epi16( m2[0], m2[1] ) );
1474
0
    m1[2] = _mm_abs_epi16( _mm_add_epi16( m2[2], m2[3] ) );
1475
0
    m1[3] = _mm_abs_epi16( _mm_sub_epi16( m2[2], m2[3] ) );
1476
0
    m1[4] = _mm_abs_epi16( _mm_add_epi16( m2[4], m2[5] ) );
1477
0
    m1[5] = _mm_abs_epi16( _mm_sub_epi16( m2[4], m2[5] ) );
1478
0
    m1[6] = _mm_abs_epi16( _mm_add_epi16( m2[6], m2[7] ) );
1479
0
    m1[7] = _mm_abs_epi16( _mm_sub_epi16( m2[6], m2[7] ) );
1480
1481
0
    for( int i = 0; i < 8; i++ )
1482
0
    {
1483
0
      m1[i] = _mm_unpacklo_epi16( m1[i], vzero );
1484
0
    }
1485
0
  }
1486
  
1487
0
  uint32_t absDc = _mm_cvtsi128_si32( m1[0] );
1488
1489
0
  m1[0] = _mm_add_epi32( m1[0], m1[1] );
1490
0
  m1[1] = _mm_add_epi32( m1[2], m1[3] );
1491
0
  m1[2] = _mm_add_epi32( m1[4], m1[5] );
1492
0
  m1[3] = _mm_add_epi32( m1[6], m1[7] );
1493
1494
0
  m1[0] = _mm_add_epi32( m1[0], m1[1] );
1495
0
  m1[1] = _mm_add_epi32( m1[2], m1[3] );
1496
1497
0
  __m128i iSum = _mm_add_epi32( m1[0], m1[1] );
1498
1499
0
  iSum = _mm_hadd_epi32( iSum, iSum );
1500
0
  iSum = _mm_hadd_epi32( iSum, iSum );
1501
1502
0
  uint32_t sad = _mm_cvtsi128_si32( iSum );
1503
0
  sad -= absDc;
1504
0
  sad += absDc >> 2;
1505
0
  sad = (uint32_t)(sad / sqrt(4.0 * 8) * 2);
1506
0
  return sad;
1507
0
}
Unexecuted instantiation: RdCost_sse41.cpp:unsigned int vvenc::xCalcHAD8x4_SSE<short, short>(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:unsigned int vvenc::xCalcHAD8x4_SSE<short, short>(short const*, short const*, int, int, int)
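
Editorial note: xCalcHAD8x4_SSE branches on the bit depth: for 10 bit and above the coefficients are widened to 32-bit lanes before the horizontal pass, while lower bit depths stay in 16-bit arithmetic throughout. A rough bound on the coefficient growth makes the split plausible. This is a sketch under the assumption that every butterfly stage at most doubles the magnitude; maxHad8x4Coeff is a hypothetical helper, not code from this file:

// Worst-case coefficient magnitude after the log2(8*4) = 5 butterfly stages of
// the 8x4 Hadamard, assuming a residual bounded by (1 << iBitDepth) - 1.
static inline long long maxHad8x4Coeff( int iBitDepth )
{
  return 32LL * ( ( 1LL << iBitDepth ) - 1 );
}

For the bit depths routed to the 16-bit path this stays well inside the signed 16-bit range (for example 32 * 511 = 16352 for 9-bit input); the >= 10 branch widens to 32-bit lanes to keep headroom for deeper content.
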
1508
1509
static uint32_t xCalcHAD4x8_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth )
1510
0
{
1511
0
  __m128i m1[8], m2[8];
1512
1513
0
  for( int k = 0; k < 8; k++ )
1514
0
  {
1515
0
    __m128i r0 = (sizeof( Torg ) > 1) ? (_vv_loadl_epi64( (__m128i*)piOrg )) : (_mm_cvtsi32_si128( *(const int*)piOrg ));
1516
0
    __m128i r1 = (sizeof( Tcur ) > 1) ? (_vv_loadl_epi64( (__m128i*)piCur )) : (_mm_cvtsi32_si128( *(const int*)piCur ));
1517
0
    m2[k] = _mm_sub_epi16( r0, r1 );
1518
0
    piCur += iStrideCur;
1519
0
    piOrg += iStrideOrg;
1520
0
  }
1521
1522
1523
  // vertical
1524
1525
0
  m1[0] = _mm_add_epi16( m2[0], m2[4] );
1526
0
  m1[1] = _mm_add_epi16( m2[1], m2[5] );
1527
0
  m1[2] = _mm_add_epi16( m2[2], m2[6] );
1528
0
  m1[3] = _mm_add_epi16( m2[3], m2[7] );
1529
0
  m1[4] = _mm_sub_epi16( m2[0], m2[4] );
1530
0
  m1[5] = _mm_sub_epi16( m2[1], m2[5] );
1531
0
  m1[6] = _mm_sub_epi16( m2[2], m2[6] );
1532
0
  m1[7] = _mm_sub_epi16( m2[3], m2[7] );
1533
1534
0
  m2[0] = _mm_add_epi16( m1[0], m1[2] );
1535
0
  m2[1] = _mm_add_epi16( m1[1], m1[3] );
1536
0
  m2[2] = _mm_sub_epi16( m1[0], m1[2] );
1537
0
  m2[3] = _mm_sub_epi16( m1[1], m1[3] );
1538
0
  m2[4] = _mm_add_epi16( m1[4], m1[6] );
1539
0
  m2[5] = _mm_add_epi16( m1[5], m1[7] );
1540
0
  m2[6] = _mm_sub_epi16( m1[4], m1[6] );
1541
0
  m2[7] = _mm_sub_epi16( m1[5], m1[7] );
1542
1543
0
  m1[0] = _mm_add_epi16( m2[0], m2[1] );
1544
0
  m1[1] = _mm_sub_epi16( m2[0], m2[1] );
1545
0
  m1[2] = _mm_add_epi16( m2[2], m2[3] );
1546
0
  m1[3] = _mm_sub_epi16( m2[2], m2[3] );
1547
0
  m1[4] = _mm_add_epi16( m2[4], m2[5] );
1548
0
  m1[5] = _mm_sub_epi16( m2[4], m2[5] );
1549
0
  m1[6] = _mm_add_epi16( m2[6], m2[7] );
1550
0
  m1[7] = _mm_sub_epi16( m2[6], m2[7] );
1551
1552
1553
  // horizontal
1554
  // transpose
1555
0
  {
1556
0
    m2[0] = _mm_unpacklo_epi16( m1[0], m1[1] );
1557
0
    m2[1] = _mm_unpacklo_epi16( m1[2], m1[3] );
1558
0
    m2[2] = _mm_unpacklo_epi16( m1[4], m1[5] );
1559
0
    m2[3] = _mm_unpacklo_epi16( m1[6], m1[7] );
1560
1561
0
    m1[0] = _mm_unpacklo_epi32( m2[0], m2[1] );
1562
0
    m1[1] = _mm_unpackhi_epi32( m2[0], m2[1] );
1563
0
    m1[2] = _mm_unpacklo_epi32( m2[2], m2[3] );
1564
0
    m1[3] = _mm_unpackhi_epi32( m2[2], m2[3] );
1565
1566
0
    m2[0] = _mm_unpacklo_epi64( m1[0], m1[2] );
1567
0
    m2[1] = _mm_unpackhi_epi64( m1[0], m1[2] );
1568
0
    m2[2] = _mm_unpacklo_epi64( m1[1], m1[3] );
1569
0
    m2[3] = _mm_unpackhi_epi64( m1[1], m1[3] );
1570
0
  }
1571
1572
0
  uint32_t absDc = 0;
1573
1574
0
  if( iBitDepth >= 10 /*sizeof( Torg ) > 1 || sizeof( Tcur ) > 1*/ )
1575
0
  {
1576
0
    __m128i n1[4][2];
1577
0
    __m128i n2[4][2];
1578
1579
0
    for( int i = 0; i < 4; i++ )
1580
0
    {
1581
0
      n1[i][0] = _mm_cvtepi16_epi32( m2[i] );
1582
0
      n1[i][1] = _mm_cvtepi16_epi32( _mm_shuffle_epi32( m2[i], 0xEE ) );
1583
0
    }
1584
1585
0
    for( int i = 0; i < 2; i++ )
1586
0
    {
1587
0
      n2[0][i] = _mm_add_epi32( n1[0][i], n1[2][i] );
1588
0
      n2[1][i] = _mm_add_epi32( n1[1][i], n1[3][i] );
1589
0
      n2[2][i] = _mm_sub_epi32( n1[0][i], n1[2][i] );
1590
0
      n2[3][i] = _mm_sub_epi32( n1[1][i], n1[3][i] );
1591
1592
0
      n1[0][i] = _mm_abs_epi32( _mm_add_epi32( n2[0][i], n2[1][i] ) );
1593
0
      n1[1][i] = _mm_abs_epi32( _mm_sub_epi32( n2[0][i], n2[1][i] ) );
1594
0
      n1[2][i] = _mm_abs_epi32( _mm_add_epi32( n2[2][i], n2[3][i] ) );
1595
0
      n1[3][i] = _mm_abs_epi32( _mm_sub_epi32( n2[2][i], n2[3][i] ) );
1596
0
    }
1597
0
    for( int i = 0; i < 4; i++ )
1598
0
    {
1599
0
      m1[i] = _mm_add_epi32( n1[i][0], n1[i][1] );
1600
0
    }
1601
1602
0
    absDc = _mm_cvtsi128_si32( n1[0][0] );
1603
0
  }
1604
0
  else
1605
0
  {
1606
0
    m1[0] = _mm_add_epi16( m2[0], m2[2] );
1607
0
    m1[1] = _mm_add_epi16( m2[1], m2[3] );
1608
0
    m1[2] = _mm_sub_epi16( m2[0], m2[2] );
1609
0
    m1[3] = _mm_sub_epi16( m2[1], m2[3] );
1610
1611
0
    m2[0] = _mm_abs_epi16( _mm_add_epi16( m1[0], m1[1] ) );
1612
0
    m2[1] = _mm_abs_epi16( _mm_sub_epi16( m1[0], m1[1] ) );
1613
0
    m2[2] = _mm_abs_epi16( _mm_add_epi16( m1[2], m1[3] ) );
1614
0
    m2[3] = _mm_abs_epi16( _mm_sub_epi16( m1[2], m1[3] ) );
1615
1616
0
    __m128i ma1, ma2;
1617
0
    __m128i vzero = _mm_setzero_si128();
1618
1619
0
    for( int i = 0; i < 4; i++ )
1620
0
    {
1621
0
      ma1 = _mm_unpacklo_epi16( m2[i], vzero );
1622
0
      ma2 = _mm_unpackhi_epi16( m2[i], vzero );
1623
0
      m1[i] = _mm_add_epi32( ma1, ma2 );
1624
0
    }
1625
1626
0
    absDc = _mm_cvtsi128_si32( m2[0] ) & 0x0000ffff;
1627
0
  }
1628
1629
0
  m1[0] = _mm_add_epi32( m1[0], m1[1] );
1630
0
  m1[2] = _mm_add_epi32( m1[2], m1[3] );
1631
1632
0
  __m128i iSum = _mm_add_epi32( m1[0], m1[2] );
1633
1634
0
  iSum = _mm_hadd_epi32( iSum, iSum );
1635
0
  iSum = _mm_hadd_epi32( iSum, iSum );
1636
1637
0
  uint32_t sad = _mm_cvtsi128_si32( iSum );
1638
  
1639
0
  sad -= absDc;
1640
0
  sad += absDc >> 2;
1641
0
  sad = (uint32_t)(sad / sqrt(4.0 * 8) * 2);
1642
1643
0
  return sad;
1644
0
}
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD4x8_SSE(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD4x8_SSE(short const*, short const*, int, int, int)
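
Editorial note: for cross-checking the SSE kernel above, the same 4x8 SATD can be written as two separable scalar Hadamard passes followed by the usual DC re-weighting and normalization. This is a reference sketch only, not the encoder's own scalar path; it assumes the Pel typedef and the includes already present in this file:

static uint32_t xCalcHAD4x8_scalar_ref( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  int d[8][4];

  for( int y = 0; y < 8; y++ )
    for( int x = 0; x < 4; x++ )
      d[y][x] = piOrg[y * iStrideOrg + x] - piCur[y * iStrideCur + x];

  // 8-point Hadamard down each column (iterative butterflies)
  for( int x = 0; x < 4; x++ )
    for( int step = 1; step < 8; step <<= 1 )
      for( int y = 0; y < 8; y += 2 * step )
        for( int k = 0; k < step; k++ )
        {
          const int a = d[y + k][x], b = d[y + k + step][x];
          d[y + k][x] = a + b;  d[y + k + step][x] = a - b;
        }

  // 4-point Hadamard along each row
  for( int y = 0; y < 8; y++ )
    for( int step = 1; step < 4; step <<= 1 )
      for( int x = 0; x < 4; x += 2 * step )
        for( int k = 0; k < step; k++ )
        {
          const int a = d[y][x + k], b = d[y][x + k + step];
          d[y][x + k] = a + b;  d[y][x + k + step] = a - b;
        }

  // sum of absolute coefficients; d[0][0] holds the DC term
  uint32_t sum = 0;
  for( int y = 0; y < 8; y++ )
    for( int x = 0; x < 4; x++ )
      sum += (uint32_t)( d[y][x] < 0 ? -d[y][x] : d[y][x] );

  const uint32_t absDc = (uint32_t)( d[0][0] < 0 ? -d[0][0] : d[0][0] );
  sum -= absDc;
  sum += absDc >> 2;
  return (uint32_t)( sum / sqrt( 4.0 * 8 ) * 2 );  // same normalization as the SIMD kernel
}

Because the sum of absolute Hadamard coefficients does not depend on the coefficient ordering, such a reference should agree with the vectorized kernel even though the butterflies are evaluated in a different order.
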
1645
1646
static uint32_t xCalcHAD32x32_fast_AVX2( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth )
1647
0
{
1648
0
  uint32_t sad = 0;
1649
1650
0
#ifdef USE_AVX2
1651
0
  const int iLoops = 2;
1652
0
  __m256i m1[2][8], m2[2][8];
1653
1654
0
  CHECK( iBitDepth > 10, "Only bitdepths up to 10 supported!" );
1655
1656
0
  for( int l = 0; l < iLoops; l++ )
1657
0
  {
1658
0
    for( int k = 0; k < 8; k++ )
1659
0
    {
1660
0
      __m256i r0 = _mm256_loadu_si256( ( __m256i* ) piOrg );
1661
0
      __m256i r1 = _mm256_loadu_si256( ( __m256i* ) piCur );
1662
0
      __m256i r2 = _mm256_loadu_si256( ( __m256i* ) ( piOrg + iStrideOrg ) );
1663
0
      __m256i r3 = _mm256_loadu_si256( ( __m256i* ) ( piCur + iStrideCur ) );
1664
1665
0
      r0 = _mm256_add_epi16( r0, r2 );
1666
0
      r1 = _mm256_add_epi16( r1, r3 );
1667
1668
0
      __m256i r4 = _mm256_loadu_si256( ( __m256i* ) ( piOrg + 16 ) );
1669
0
      __m256i r5 = _mm256_loadu_si256( ( __m256i* ) ( piCur + 16 ) );
1670
0
      r2 = _mm256_loadu_si256( ( __m256i* ) ( piOrg + iStrideOrg + 16 ) );
1671
0
      r3 = _mm256_loadu_si256( ( __m256i* ) ( piCur + iStrideCur + 16 ) );
1672
1673
0
      r2 = _mm256_add_epi16( r4, r2 );
1674
0
      r3 = _mm256_add_epi16( r5, r3 );
1675
1676
0
      r0 = _mm256_hadd_epi16( r0, r2 );
1677
0
      r1 = _mm256_hadd_epi16( r1, r3 );
1678
1679
0
      r0 = _mm256_add_epi16( r0, _mm256_set1_epi16( 2 ) );
1680
0
      r1 = _mm256_add_epi16( r1, _mm256_set1_epi16( 2 ) );
1681
1682
0
      r0 = _mm256_srai_epi16( r0, 2 );
1683
0
      r1 = _mm256_srai_epi16( r1, 2 );
1684
1685
0
      m2[0][k] = _mm256_permute4x64_epi64( _mm256_sub_epi16( r0, r1 ), 0 + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); // 11 bit
1686
      //m2[1][k] = _mm256_cvtepi16_epi32( _mm256_extracti128_si256( m2[0][k], 1 ) );
1687
      //m2[0][k] = _mm256_cvtepi16_epi32( _mm256_castsi256_si128( m2[0][k] ) );
1688
0
      piCur += iStrideCur * 2;
1689
0
      piOrg += iStrideOrg * 2;
1690
0
    }
1691
1692
0
    m1[0][0] = _mm256_add_epi16( m2[0][0], m2[0][4] );
1693
0
    m1[0][1] = _mm256_add_epi16( m2[0][1], m2[0][5] );
1694
0
    m1[0][2] = _mm256_add_epi16( m2[0][2], m2[0][6] );
1695
0
    m1[0][3] = _mm256_add_epi16( m2[0][3], m2[0][7] );
1696
0
    m1[0][4] = _mm256_sub_epi16( m2[0][0], m2[0][4] );
1697
0
    m1[0][5] = _mm256_sub_epi16( m2[0][1], m2[0][5] );
1698
0
    m1[0][6] = _mm256_sub_epi16( m2[0][2], m2[0][6] );
1699
0
    m1[0][7] = _mm256_sub_epi16( m2[0][3], m2[0][7] ); // 12 bit
1700
1701
0
    m2[0][0] = _mm256_add_epi16( m1[0][0], m1[0][2] );
1702
0
    m2[0][1] = _mm256_add_epi16( m1[0][1], m1[0][3] );
1703
0
    m2[0][2] = _mm256_sub_epi16( m1[0][0], m1[0][2] );
1704
0
    m2[0][3] = _mm256_sub_epi16( m1[0][1], m1[0][3] );
1705
0
    m2[0][4] = _mm256_add_epi16( m1[0][4], m1[0][6] );
1706
0
    m2[0][5] = _mm256_add_epi16( m1[0][5], m1[0][7] );
1707
0
    m2[0][6] = _mm256_sub_epi16( m1[0][4], m1[0][6] );
1708
0
    m2[0][7] = _mm256_sub_epi16( m1[0][5], m1[0][7] ); // 13 bit
1709
1710
0
    m1[0][0] = _mm256_add_epi16( m2[0][0], m2[0][1] );
1711
0
    m1[0][1] = _mm256_sub_epi16( m2[0][0], m2[0][1] );
1712
0
    m1[0][2] = _mm256_add_epi16( m2[0][2], m2[0][3] );
1713
0
    m1[0][3] = _mm256_sub_epi16( m2[0][2], m2[0][3] );
1714
0
    m1[0][4] = _mm256_add_epi16( m2[0][4], m2[0][5] );
1715
0
    m1[0][5] = _mm256_sub_epi16( m2[0][4], m2[0][5] );
1716
0
    m1[0][6] = _mm256_add_epi16( m2[0][6], m2[0][7] );
1717
0
    m1[0][7] = _mm256_sub_epi16( m2[0][6], m2[0][7] ); // 14 bit
1718
1719
    // transpose
1720
    // 8x8
1721
0
    m2[0][0] = _mm256_unpacklo_epi16( m1[0][0], m1[0][1] );
1722
0
    m2[0][1] = _mm256_unpacklo_epi16( m1[0][2], m1[0][3] );
1723
0
    m2[0][2] = _mm256_unpacklo_epi16( m1[0][4], m1[0][5] );
1724
0
    m2[0][3] = _mm256_unpacklo_epi16( m1[0][6], m1[0][7] );
1725
0
    m2[0][4] = _mm256_unpackhi_epi16( m1[0][0], m1[0][1] );
1726
0
    m2[0][5] = _mm256_unpackhi_epi16( m1[0][2], m1[0][3] );
1727
0
    m2[0][6] = _mm256_unpackhi_epi16( m1[0][4], m1[0][5] );
1728
0
    m2[0][7] = _mm256_unpackhi_epi16( m1[0][6], m1[0][7] );
1729
1730
0
    m1[0][0] = _mm256_unpacklo_epi32( m2[0][0], m2[0][1] );
1731
0
    m1[0][1] = _mm256_unpackhi_epi32( m2[0][0], m2[0][1] );
1732
0
    m1[0][2] = _mm256_unpacklo_epi32( m2[0][2], m2[0][3] );
1733
0
    m1[0][3] = _mm256_unpackhi_epi32( m2[0][2], m2[0][3] );
1734
0
    m1[0][4] = _mm256_unpacklo_epi32( m2[0][4], m2[0][5] );
1735
0
    m1[0][5] = _mm256_unpackhi_epi32( m2[0][4], m2[0][5] );
1736
0
    m1[0][6] = _mm256_unpacklo_epi32( m2[0][6], m2[0][7] );
1737
0
    m1[0][7] = _mm256_unpackhi_epi32( m2[0][6], m2[0][7] );
1738
1739
0
    m2[0][0] = _mm256_unpacklo_epi64( m1[0][0], m1[0][2] );
1740
0
    m2[0][1] = _mm256_unpackhi_epi64( m1[0][0], m1[0][2] );
1741
0
    m2[0][2] = _mm256_unpacklo_epi64( m1[0][1], m1[0][3] );
1742
0
    m2[0][3] = _mm256_unpackhi_epi64( m1[0][1], m1[0][3] );
1743
0
    m2[0][4] = _mm256_unpacklo_epi64( m1[0][4], m1[0][6] );
1744
0
    m2[0][5] = _mm256_unpackhi_epi64( m1[0][4], m1[0][6] );
1745
0
    m2[0][6] = _mm256_unpacklo_epi64( m1[0][5], m1[0][7] );
1746
0
    m2[0][7] = _mm256_unpackhi_epi64( m1[0][5], m1[0][7] );
1747
1748
0
    __m256i vzero = _mm256_setzero_si256();
1749
0
    __m256i vtmp;
1750
1751
0
#define UNPACKX(x)                                        \
1752
0
    vtmp = _mm256_cmpgt_epi16( vzero, m2[0][x] );         \
1753
0
    m1[0][x] = _mm256_unpacklo_epi16( m2[0][x], vtmp );   \
1754
0
    m1[1][x] = _mm256_unpackhi_epi16( m2[0][x], vtmp );
1755
1756
0
    UNPACKX( 0 );
1757
0
    UNPACKX( 1 );
1758
0
    UNPACKX( 2 );
1759
0
    UNPACKX( 3 );
1760
0
    UNPACKX( 4 );
1761
0
    UNPACKX( 5 );
1762
0
    UNPACKX( 6 );
1763
0
    UNPACKX( 7 );
1764
1765
0
#undef UNPACKX
1766
1767
0
    for( int i = 0; i < 2; i++ )
1768
0
    {
1769
0
      m2[i][0] = _mm256_add_epi32( m1[i][0], m1[i][4] );
1770
0
      m2[i][1] = _mm256_add_epi32( m1[i][1], m1[i][5] );
1771
0
      m2[i][2] = _mm256_add_epi32( m1[i][2], m1[i][6] );
1772
0
      m2[i][3] = _mm256_add_epi32( m1[i][3], m1[i][7] );
1773
0
      m2[i][4] = _mm256_sub_epi32( m1[i][0], m1[i][4] );
1774
0
      m2[i][5] = _mm256_sub_epi32( m1[i][1], m1[i][5] );
1775
0
      m2[i][6] = _mm256_sub_epi32( m1[i][2], m1[i][6] );
1776
0
      m2[i][7] = _mm256_sub_epi32( m1[i][3], m1[i][7] );
1777
1778
0
      m1[i][0] = _mm256_add_epi32( m2[i][0], m2[i][2] );
1779
0
      m1[i][1] = _mm256_add_epi32( m2[i][1], m2[i][3] );
1780
0
      m1[i][2] = _mm256_sub_epi32( m2[i][0], m2[i][2] );
1781
0
      m1[i][3] = _mm256_sub_epi32( m2[i][1], m2[i][3] );
1782
0
      m1[i][4] = _mm256_add_epi32( m2[i][4], m2[i][6] );
1783
0
      m1[i][5] = _mm256_add_epi32( m2[i][5], m2[i][7] );
1784
0
      m1[i][6] = _mm256_sub_epi32( m2[i][4], m2[i][6] );
1785
0
      m1[i][7] = _mm256_sub_epi32( m2[i][5], m2[i][7] );
1786
1787
0
      m2[i][0] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][0], m1[i][1] ) );
1788
0
      m2[i][1] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][0], m1[i][1] ) );
1789
0
      m2[i][2] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][2], m1[i][3] ) );
1790
0
      m2[i][3] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][2], m1[i][3] ) );
1791
0
      m2[i][4] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][4], m1[i][5] ) );
1792
0
      m2[i][5] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][4], m1[i][5] ) );
1793
0
      m2[i][6] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][6], m1[i][7] ) );
1794
0
      m2[i][7] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][6], m1[i][7] ) );
1795
0
    }
1796
1797
0
    uint32_t absDc0 = _mm_cvtsi128_si32( _mm256_castsi256_si128( m2[0][0] ) );
1798
0
    uint32_t absDc1 = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( m2[0][0], m2[0][0], 0x11 ) ) );
1799
1800
0
    for( int i = 0; i < 8; i++ )
1801
0
    {
1802
0
      m1[0][i] = _mm256_add_epi32( m2[0][i], m2[1][i] );
1803
0
    }
1804
1805
0
    m1[0][0] = _mm256_add_epi32( m1[0][0], m1[0][1] );
1806
0
    m1[0][2] = _mm256_add_epi32( m1[0][2], m1[0][3] );
1807
0
    m1[0][4] = _mm256_add_epi32( m1[0][4], m1[0][5] );
1808
0
    m1[0][6] = _mm256_add_epi32( m1[0][6], m1[0][7] );
1809
1810
0
    m1[0][0] = _mm256_add_epi32( m1[0][0], m1[0][2] );
1811
0
    m1[0][4] = _mm256_add_epi32( m1[0][4], m1[0][6] );
1812
1813
0
    __m256i iSum = _mm256_add_epi32( m1[0][0], m1[0][4] );
1814
1815
0
    iSum = _mm256_hadd_epi32( iSum, iSum );
1816
0
    iSum = _mm256_hadd_epi32( iSum, iSum );
1817
1818
0
    uint32_t tmp;
1819
0
    tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( iSum ) );
1820
    // the downsampled 16x16 block is done by adding together 4 8x8 SATDs
1821
0
    tmp -= absDc0;
1822
0
    tmp += absDc0 >> 2;
1823
0
    tmp = ( ( tmp + 2 ) >> 2 );
1824
0
    sad += tmp;
1825
1826
0
    tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( iSum, iSum, 0x11 ) ) );
1827
    // the downsampled 16x16 block is done by adding together 4 8x8 SATDs
1828
0
    tmp -= absDc1;
1829
0
    tmp += absDc1 >> 2;
1830
0
    tmp = ( ( tmp + 2 ) >> 2 );
1831
0
    sad += tmp;
1832
0
  }
1833
1834
0
#endif
1835
0
  return ( sad << 2 );
1836
0
}
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD32x32_fast_AVX2(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD32x32_fast_AVX2(short const*, short const*, int, int, int)
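
Editorial note: the fast 32x32 kernel above does not transform the full block. Each loop iteration first averages 2x2 neighbourhoods of both inputs (the paired row loads, _mm256_hadd_epi16 and the +2 / >>2 rounding), runs the usual 8x8 Hadamard stages on the downsampled samples, and the final 'sad << 2' compensates for the 4:1 sample reduction. A scalar sketch of the downsampling step, for illustration only (avg2x2 is a hypothetical helper and Pel is the sample type from this file):

static inline Pel avg2x2( const Pel* p, int stride, int x, int y )
{
  // rounded mean of a 2x2 neighbourhood, as formed by the paired loads,
  // _mm256_hadd_epi16 and the ( v + 2 ) >> 2 step in xCalcHAD32x32_fast_AVX2
  return (Pel)( ( p[ y * stride + x     ] + p[ y * stride + x + 1 ]
                + p[ ( y + 1 ) * stride + x ] + p[ ( y + 1 ) * stride + x + 1 ] + 2 ) >> 2 );
}
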
1837
1838
static uint32_t xCalcHAD16x16_AVX2( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth )
1839
0
{
1840
0
  uint32_t sad = 0;
1841
1842
0
#ifdef USE_AVX2
1843
0
  const int iLoops = 2;
1844
0
  __m256i m1[2][8], m2[2][8];
1845
1846
0
  CHECK( iBitDepth > 10, "Only bitdepths up to 10 supported!" );
1847
1848
0
  for( int l = 0; l < iLoops; l++ )
1849
0
  {
1850
0
    for( int k = 0; k < 8; k++ )
1851
0
    {
1852
0
      __m256i r0 = _mm256_loadu_si256( ( __m256i* ) piOrg );
1853
0
      __m256i r1 = _mm256_loadu_si256( ( __m256i* ) piCur );
1854
0
      m2[0][k] = _mm256_sub_epi16( r0, r1 ); // 11 bit
1855
      //m2[1][k] = _mm256_cvtepi16_epi32( _mm256_extracti128_si256( m2[0][k], 1 ) );
1856
      //m2[0][k] = _mm256_cvtepi16_epi32( _mm256_castsi256_si128( m2[0][k] ) );
1857
0
      piCur += iStrideCur;
1858
0
      piOrg += iStrideOrg;
1859
0
    }
1860
1861
0
    m1[0][0] = _mm256_add_epi16( m2[0][0], m2[0][4] );
1862
0
    m1[0][1] = _mm256_add_epi16( m2[0][1], m2[0][5] );
1863
0
    m1[0][2] = _mm256_add_epi16( m2[0][2], m2[0][6] );
1864
0
    m1[0][3] = _mm256_add_epi16( m2[0][3], m2[0][7] );
1865
0
    m1[0][4] = _mm256_sub_epi16( m2[0][0], m2[0][4] );
1866
0
    m1[0][5] = _mm256_sub_epi16( m2[0][1], m2[0][5] );
1867
0
    m1[0][6] = _mm256_sub_epi16( m2[0][2], m2[0][6] );
1868
0
    m1[0][7] = _mm256_sub_epi16( m2[0][3], m2[0][7] ); // 12 bit
1869
1870
0
    m2[0][0] = _mm256_add_epi16( m1[0][0], m1[0][2] );
1871
0
    m2[0][1] = _mm256_add_epi16( m1[0][1], m1[0][3] );
1872
0
    m2[0][2] = _mm256_sub_epi16( m1[0][0], m1[0][2] );
1873
0
    m2[0][3] = _mm256_sub_epi16( m1[0][1], m1[0][3] );
1874
0
    m2[0][4] = _mm256_add_epi16( m1[0][4], m1[0][6] );
1875
0
    m2[0][5] = _mm256_add_epi16( m1[0][5], m1[0][7] );
1876
0
    m2[0][6] = _mm256_sub_epi16( m1[0][4], m1[0][6] );
1877
0
    m2[0][7] = _mm256_sub_epi16( m1[0][5], m1[0][7] ); // 13 bit
1878
1879
0
    m1[0][0] = _mm256_add_epi16( m2[0][0], m2[0][1] );
1880
0
    m1[0][1] = _mm256_sub_epi16( m2[0][0], m2[0][1] );
1881
0
    m1[0][2] = _mm256_add_epi16( m2[0][2], m2[0][3] );
1882
0
    m1[0][3] = _mm256_sub_epi16( m2[0][2], m2[0][3] );
1883
0
    m1[0][4] = _mm256_add_epi16( m2[0][4], m2[0][5] );
1884
0
    m1[0][5] = _mm256_sub_epi16( m2[0][4], m2[0][5] );
1885
0
    m1[0][6] = _mm256_add_epi16( m2[0][6], m2[0][7] );
1886
0
    m1[0][7] = _mm256_sub_epi16( m2[0][6], m2[0][7] ); // 14 bit
1887
1888
    // transpose
1889
    // 8x8
1890
0
    m2[0][0] = _mm256_unpacklo_epi16( m1[0][0], m1[0][1] );
1891
0
    m2[0][1] = _mm256_unpacklo_epi16( m1[0][2], m1[0][3] );
1892
0
    m2[0][2] = _mm256_unpacklo_epi16( m1[0][4], m1[0][5] );
1893
0
    m2[0][3] = _mm256_unpacklo_epi16( m1[0][6], m1[0][7] );
1894
0
    m2[0][4] = _mm256_unpackhi_epi16( m1[0][0], m1[0][1] );
1895
0
    m2[0][5] = _mm256_unpackhi_epi16( m1[0][2], m1[0][3] );
1896
0
    m2[0][6] = _mm256_unpackhi_epi16( m1[0][4], m1[0][5] );
1897
0
    m2[0][7] = _mm256_unpackhi_epi16( m1[0][6], m1[0][7] );
1898
1899
0
    m1[0][0] = _mm256_unpacklo_epi32( m2[0][0], m2[0][1] );
1900
0
    m1[0][1] = _mm256_unpackhi_epi32( m2[0][0], m2[0][1] );
1901
0
    m1[0][2] = _mm256_unpacklo_epi32( m2[0][2], m2[0][3] );
1902
0
    m1[0][3] = _mm256_unpackhi_epi32( m2[0][2], m2[0][3] );
1903
0
    m1[0][4] = _mm256_unpacklo_epi32( m2[0][4], m2[0][5] );
1904
0
    m1[0][5] = _mm256_unpackhi_epi32( m2[0][4], m2[0][5] );
1905
0
    m1[0][6] = _mm256_unpacklo_epi32( m2[0][6], m2[0][7] );
1906
0
    m1[0][7] = _mm256_unpackhi_epi32( m2[0][6], m2[0][7] );
1907
1908
0
    m2[0][0] = _mm256_unpacklo_epi64( m1[0][0], m1[0][2] );
1909
0
    m2[0][1] = _mm256_unpackhi_epi64( m1[0][0], m1[0][2] );
1910
0
    m2[0][2] = _mm256_unpacklo_epi64( m1[0][1], m1[0][3] );
1911
0
    m2[0][3] = _mm256_unpackhi_epi64( m1[0][1], m1[0][3] );
1912
0
    m2[0][4] = _mm256_unpacklo_epi64( m1[0][4], m1[0][6] );
1913
0
    m2[0][5] = _mm256_unpackhi_epi64( m1[0][4], m1[0][6] );
1914
0
    m2[0][6] = _mm256_unpacklo_epi64( m1[0][5], m1[0][7] );
1915
0
    m2[0][7] = _mm256_unpackhi_epi64( m1[0][5], m1[0][7] );
1916
1917
0
    __m256i vzero = _mm256_setzero_si256();
1918
0
    __m256i vtmp;
1919
1920
0
#define UNPACKX(x)                                        \
1921
0
    vtmp = _mm256_cmpgt_epi16( vzero, m2[0][x] );         \
1922
0
    m1[0][x] = _mm256_unpacklo_epi16( m2[0][x], vtmp );   \
1923
0
    m1[1][x] = _mm256_unpackhi_epi16( m2[0][x], vtmp );
1924
1925
0
    UNPACKX( 0 );
1926
0
    UNPACKX( 1 );
1927
0
    UNPACKX( 2 );
1928
0
    UNPACKX( 3 );
1929
0
    UNPACKX( 4 );
1930
0
    UNPACKX( 5 );
1931
0
    UNPACKX( 6 );
1932
0
    UNPACKX( 7 );
1933
1934
0
#undef UNPACKX
1935
1936
0
    for( int i = 0; i < 2; i++ )
1937
0
    {
1938
0
      m2[i][0] = _mm256_add_epi32( m1[i][0], m1[i][4] );
1939
0
      m2[i][1] = _mm256_add_epi32( m1[i][1], m1[i][5] );
1940
0
      m2[i][2] = _mm256_add_epi32( m1[i][2], m1[i][6] );
1941
0
      m2[i][3] = _mm256_add_epi32( m1[i][3], m1[i][7] );
1942
0
      m2[i][4] = _mm256_sub_epi32( m1[i][0], m1[i][4] );
1943
0
      m2[i][5] = _mm256_sub_epi32( m1[i][1], m1[i][5] );
1944
0
      m2[i][6] = _mm256_sub_epi32( m1[i][2], m1[i][6] );
1945
0
      m2[i][7] = _mm256_sub_epi32( m1[i][3], m1[i][7] );
1946
1947
0
      m1[i][0] = _mm256_add_epi32( m2[i][0], m2[i][2] );
1948
0
      m1[i][1] = _mm256_add_epi32( m2[i][1], m2[i][3] );
1949
0
      m1[i][2] = _mm256_sub_epi32( m2[i][0], m2[i][2] );
1950
0
      m1[i][3] = _mm256_sub_epi32( m2[i][1], m2[i][3] );
1951
0
      m1[i][4] = _mm256_add_epi32( m2[i][4], m2[i][6] );
1952
0
      m1[i][5] = _mm256_add_epi32( m2[i][5], m2[i][7] );
1953
0
      m1[i][6] = _mm256_sub_epi32( m2[i][4], m2[i][6] );
1954
0
      m1[i][7] = _mm256_sub_epi32( m2[i][5], m2[i][7] );
1955
1956
0
      m2[i][0] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][0], m1[i][1] ) );
1957
0
      m2[i][1] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][0], m1[i][1] ) );
1958
0
      m2[i][2] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][2], m1[i][3] ) );
1959
0
      m2[i][3] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][2], m1[i][3] ) );
1960
0
      m2[i][4] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][4], m1[i][5] ) );
1961
0
      m2[i][5] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][4], m1[i][5] ) );
1962
0
      m2[i][6] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][6], m1[i][7] ) );
1963
0
      m2[i][7] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][6], m1[i][7] ) );
1964
0
    }
1965
1966
0
    uint32_t absDc0 = _mm_cvtsi128_si32( _mm256_castsi256_si128( m2[0][0] ) );
1967
0
    uint32_t absDc1 = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( m2[0][0], m2[0][0], 0x11 ) ) );
1968
1969
0
    for( int i = 0; i < 8; i++ )
1970
0
    {
1971
0
      m1[0][i] = _mm256_add_epi32( m2[0][i], m2[1][i] );
1972
0
    }
1973
1974
0
    m1[0][0] = _mm256_add_epi32( m1[0][0], m1[0][1] );
1975
0
    m1[0][2] = _mm256_add_epi32( m1[0][2], m1[0][3] );
1976
0
    m1[0][4] = _mm256_add_epi32( m1[0][4], m1[0][5] );
1977
0
    m1[0][6] = _mm256_add_epi32( m1[0][6], m1[0][7] );
1978
1979
0
    m1[0][0] = _mm256_add_epi32( m1[0][0], m1[0][2] );
1980
0
    m1[0][4] = _mm256_add_epi32( m1[0][4], m1[0][6] );
1981
1982
0
    __m256i iSum = _mm256_add_epi32( m1[0][0], m1[0][4] );
1983
1984
0
    iSum = _mm256_hadd_epi32( iSum, iSum );
1985
0
    iSum = _mm256_hadd_epi32( iSum, iSum );
1986
1987
0
    uint32_t tmp;
1988
0
    tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( iSum ) );
1989
    // 16x16 block is done by adding together 4 8x8 SATDs
1990
0
    tmp -= absDc0;
1991
0
    tmp += absDc0 >> 2;
1992
0
    tmp = ( ( tmp + 2 ) >> 2 );
1993
0
    sad += tmp;
1994
1995
0
    tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( iSum, iSum, 0x11 ) ) );
1996
    // 16x16 block is done by adding together 4 8x8 SATDs
1997
0
    tmp -= absDc1;
1998
0
    tmp += absDc1 >> 2;
1999
0
    tmp = ( ( tmp + 2 ) >> 2 );
2000
0
    sad += tmp;
2001
0
  }
2002
2003
0
#endif
2004
0
  return ( sad );
2005
0
}
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD16x16_AVX2(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD16x16_AVX2(short const*, short const*, int, int, int)
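
Editorial note: the UNPACKX macro used in the two AVX2 kernels above widens the packed 16-bit coefficients to 32-bit lanes without _mm256_cvtepi16_epi32: comparing against zero produces the sign mask, and interleaving value and mask amounts to a manual sign extension. A single-lane scalar sketch (signExtend16 is a hypothetical helper; the fixed-width types are assumed from the file's includes):

static inline int32_t signExtend16( int16_t v )
{
  const uint16_t mask = ( 0 > v ) ? 0xFFFFu : 0u;                        // _mm256_cmpgt_epi16( vzero, v )
  return (int32_t)( (uint32_t)(uint16_t)v | ( (uint32_t)mask << 16 ) );  // unpacklo/unpackhi interleave
}
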
2006
2007
static uint32_t xCalcHAD16x8_AVX2( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth )
2008
0
{
2009
0
  uint32_t sad = 0;
2010
2011
0
#ifdef USE_AVX2
2012
0
  __m256i m1[16], m2[16];
2013
2014
0
  CHECK( iBitDepth > 10, "Only bitdepths up to 10 supported!" );
2015
2016
0
  {
2017
0
    for( int k = 0; k < 8; k++ )
2018
0
    {
2019
0
      __m256i r0 = _mm256_lddqu_si256( (__m256i*)piOrg );
2020
0
      __m256i r1 = _mm256_lddqu_si256( (__m256i*)piCur );
2021
0
      m1[k]   = _mm256_sub_epi16( r0, r1 ); // 11 bit
2022
      //m1[k+8] = _mm256_cvtepi16_epi32( _mm256_extracti128_si256( m1[k], 1 ) );
2023
      //m1[k]   = _mm256_cvtepi16_epi32( _mm256_castsi256_si128  ( m1[k]    ) );
2024
0
      piCur += iStrideCur;
2025
0
      piOrg += iStrideOrg;
2026
0
    }
2027
2028
    // vertical, first 8x8
2029
#if 0
2030
    m2[0] = _mm256_add_epi32( m1[0], m1[4] );
2031
    m2[1] = _mm256_add_epi32( m1[1], m1[5] );
2032
    m2[2] = _mm256_add_epi32( m1[2], m1[6] );
2033
    m2[3] = _mm256_add_epi32( m1[3], m1[7] );
2034
    m2[4] = _mm256_sub_epi32( m1[0], m1[4] );
2035
    m2[5] = _mm256_sub_epi32( m1[1], m1[5] );
2036
    m2[6] = _mm256_sub_epi32( m1[2], m1[6] );
2037
    m2[7] = _mm256_sub_epi32( m1[3], m1[7] );
2038
2039
    m1[0] = _mm256_add_epi32( m2[0], m2[2] );
2040
    m1[1] = _mm256_add_epi32( m2[1], m2[3] );
2041
    m1[2] = _mm256_sub_epi32( m2[0], m2[2] );
2042
    m1[3] = _mm256_sub_epi32( m2[1], m2[3] );
2043
    m1[4] = _mm256_add_epi32( m2[4], m2[6] );
2044
    m1[5] = _mm256_add_epi32( m2[5], m2[7] );
2045
    m1[6] = _mm256_sub_epi32( m2[4], m2[6] );
2046
    m1[7] = _mm256_sub_epi32( m2[5], m2[7] );
2047
2048
    m2[0] = _mm256_add_epi32( m1[0], m1[1] );
2049
    m2[1] = _mm256_sub_epi32( m1[0], m1[1] );
2050
    m2[2] = _mm256_add_epi32( m1[2], m1[3] );
2051
    m2[3] = _mm256_sub_epi32( m1[2], m1[3] );
2052
    m2[4] = _mm256_add_epi32( m1[4], m1[5] );
2053
    m2[5] = _mm256_sub_epi32( m1[4], m1[5] );
2054
    m2[6] = _mm256_add_epi32( m1[6], m1[7] );
2055
    m2[7] = _mm256_sub_epi32( m1[6], m1[7] );
2056
2057
    // vertical, second 8x8
2058
    m2[8+0] = _mm256_add_epi32( m1[8+0], m1[8+4] );
2059
    m2[8+1] = _mm256_add_epi32( m1[8+1], m1[8+5] );
2060
    m2[8+2] = _mm256_add_epi32( m1[8+2], m1[8+6] );
2061
    m2[8+3] = _mm256_add_epi32( m1[8+3], m1[8+7] );
2062
    m2[8+4] = _mm256_sub_epi32( m1[8+0], m1[8+4] );
2063
    m2[8+5] = _mm256_sub_epi32( m1[8+1], m1[8+5] );
2064
    m2[8+6] = _mm256_sub_epi32( m1[8+2], m1[8+6] );
2065
    m2[8+7] = _mm256_sub_epi32( m1[8+3], m1[8+7] );
2066
2067
    m1[8+0] = _mm256_add_epi32( m2[8+0], m2[8+2] );
2068
    m1[8+1] = _mm256_add_epi32( m2[8+1], m2[8+3] );
2069
    m1[8+2] = _mm256_sub_epi32( m2[8+0], m2[8+2] );
2070
    m1[8+3] = _mm256_sub_epi32( m2[8+1], m2[8+3] );
2071
    m1[8+4] = _mm256_add_epi32( m2[8+4], m2[8+6] );
2072
    m1[8+5] = _mm256_add_epi32( m2[8+5], m2[8+7] );
2073
    m1[8+6] = _mm256_sub_epi32( m2[8+4], m2[8+6] );
2074
    m1[8+7] = _mm256_sub_epi32( m2[8+5], m2[8+7] );
2075
2076
    m2[8+0] = _mm256_add_epi32( m1[8+0], m1[8+1] );
2077
    m2[8+1] = _mm256_sub_epi32( m1[8+0], m1[8+1] );
2078
    m2[8+2] = _mm256_add_epi32( m1[8+2], m1[8+3] );
2079
    m2[8+3] = _mm256_sub_epi32( m1[8+2], m1[8+3] );
2080
    m2[8+4] = _mm256_add_epi32( m1[8+4], m1[8+5] );
2081
    m2[8+5] = _mm256_sub_epi32( m1[8+4], m1[8+5] );
2082
    m2[8+6] = _mm256_add_epi32( m1[8+6], m1[8+7] );
2083
    m2[8+7] = _mm256_sub_epi32( m1[8+6], m1[8+7] );
2084
2085
    // transpose
2086
    constexpr int perm_unpacklo_epi128 = ( 0 << 0 ) + ( 2 << 4 );
2087
    constexpr int perm_unpackhi_epi128 = ( 1 << 0 ) + ( 3 << 4 );
2088
2089
    m1[0] = _mm256_unpacklo_epi32( m2[0], m2[1] );
2090
    m1[1] = _mm256_unpacklo_epi32( m2[2], m2[3] );
2091
    m1[2] = _mm256_unpacklo_epi32( m2[4], m2[5] );
2092
    m1[3] = _mm256_unpacklo_epi32( m2[6], m2[7] );
2093
    m1[4] = _mm256_unpackhi_epi32( m2[0], m2[1] );
2094
    m1[5] = _mm256_unpackhi_epi32( m2[2], m2[3] );
2095
    m1[6] = _mm256_unpackhi_epi32( m2[4], m2[5] );
2096
    m1[7] = _mm256_unpackhi_epi32( m2[6], m2[7] );
2097
2098
    m2[0] = _mm256_unpacklo_epi64( m1[0], m1[1] );
2099
    m2[1] = _mm256_unpackhi_epi64( m1[0], m1[1] );
2100
    m2[2] = _mm256_unpacklo_epi64( m1[2], m1[3] );
2101
    m2[3] = _mm256_unpackhi_epi64( m1[2], m1[3] );
2102
    m2[4] = _mm256_unpacklo_epi64( m1[4], m1[5] );
2103
    m2[5] = _mm256_unpackhi_epi64( m1[4], m1[5] );
2104
    m2[6] = _mm256_unpacklo_epi64( m1[6], m1[7] );
2105
    m2[7] = _mm256_unpackhi_epi64( m1[6], m1[7] );
2106
2107
    m1[0] = _mm256_permute2x128_si256( m2[0], m2[2], perm_unpacklo_epi128 );
2108
    m1[1] = _mm256_permute2x128_si256( m2[0], m2[2], perm_unpackhi_epi128 );
2109
    m1[2] = _mm256_permute2x128_si256( m2[1], m2[3], perm_unpacklo_epi128 );
2110
    m1[3] = _mm256_permute2x128_si256( m2[1], m2[3], perm_unpackhi_epi128 );
2111
    m1[4] = _mm256_permute2x128_si256( m2[4], m2[6], perm_unpacklo_epi128 );
2112
    m1[5] = _mm256_permute2x128_si256( m2[4], m2[6], perm_unpackhi_epi128 );
2113
    m1[6] = _mm256_permute2x128_si256( m2[5], m2[7], perm_unpacklo_epi128 );
2114
    m1[7] = _mm256_permute2x128_si256( m2[5], m2[7], perm_unpackhi_epi128 );
2115
2116
    m1[8+0] = _mm256_unpacklo_epi32( m2[8+0], m2[8+1] );
2117
    m1[8+1] = _mm256_unpacklo_epi32( m2[8+2], m2[8+3] );
2118
    m1[8+2] = _mm256_unpacklo_epi32( m2[8+4], m2[8+5] );
2119
    m1[8+3] = _mm256_unpacklo_epi32( m2[8+6], m2[8+7] );
2120
    m1[8+4] = _mm256_unpackhi_epi32( m2[8+0], m2[8+1] );
2121
    m1[8+5] = _mm256_unpackhi_epi32( m2[8+2], m2[8+3] );
2122
    m1[8+6] = _mm256_unpackhi_epi32( m2[8+4], m2[8+5] );
2123
    m1[8+7] = _mm256_unpackhi_epi32( m2[8+6], m2[8+7] );
2124
2125
    m2[8+0] = _mm256_unpacklo_epi64( m1[8+0], m1[8+1] );
2126
    m2[8+1] = _mm256_unpackhi_epi64( m1[8+0], m1[8+1] );
2127
    m2[8+2] = _mm256_unpacklo_epi64( m1[8+2], m1[8+3] );
2128
    m2[8+3] = _mm256_unpackhi_epi64( m1[8+2], m1[8+3] );
2129
    m2[8+4] = _mm256_unpacklo_epi64( m1[8+4], m1[8+5] );
2130
    m2[8+5] = _mm256_unpackhi_epi64( m1[8+4], m1[8+5] );
2131
    m2[8+6] = _mm256_unpacklo_epi64( m1[8+6], m1[8+7] );
2132
    m2[8+7] = _mm256_unpackhi_epi64( m1[8+6], m1[8+7] );
2133
2134
    m1[8+0] = _mm256_permute2x128_si256( m2[8+0], m2[8+2], perm_unpacklo_epi128 );
2135
    m1[8+1] = _mm256_permute2x128_si256( m2[8+0], m2[8+2], perm_unpackhi_epi128 );
2136
    m1[8+2] = _mm256_permute2x128_si256( m2[8+1], m2[8+3], perm_unpacklo_epi128 );
2137
    m1[8+3] = _mm256_permute2x128_si256( m2[8+1], m2[8+3], perm_unpackhi_epi128 );
2138
    m1[8+4] = _mm256_permute2x128_si256( m2[8+4], m2[8+6], perm_unpacklo_epi128 );
2139
    m1[8+5] = _mm256_permute2x128_si256( m2[8+4], m2[8+6], perm_unpackhi_epi128 );
2140
    m1[8+6] = _mm256_permute2x128_si256( m2[8+5], m2[8+7], perm_unpacklo_epi128 );
2141
    m1[8+7] = _mm256_permute2x128_si256( m2[8+5], m2[8+7], perm_unpackhi_epi128 );
2142
#else
2143
0
    m2[0] = _mm256_add_epi16( m1[0], m1[4] );
2144
0
    m2[1] = _mm256_add_epi16( m1[1], m1[5] );
2145
0
    m2[2] = _mm256_add_epi16( m1[2], m1[6] );
2146
0
    m2[3] = _mm256_add_epi16( m1[3], m1[7] );
2147
0
    m2[4] = _mm256_sub_epi16( m1[0], m1[4] );
2148
0
    m2[5] = _mm256_sub_epi16( m1[1], m1[5] );
2149
0
    m2[6] = _mm256_sub_epi16( m1[2], m1[6] );
2150
0
    m2[7] = _mm256_sub_epi16( m1[3], m1[7] ); // 12 bit
2151
2152
0
    m1[0] = _mm256_add_epi16( m2[0], m2[2] );
2153
0
    m1[1] = _mm256_add_epi16( m2[1], m2[3] );
2154
0
    m1[2] = _mm256_sub_epi16( m2[0], m2[2] );
2155
0
    m1[3] = _mm256_sub_epi16( m2[1], m2[3] );
2156
0
    m1[4] = _mm256_add_epi16( m2[4], m2[6] );
2157
0
    m1[5] = _mm256_add_epi16( m2[5], m2[7] );
2158
0
    m1[6] = _mm256_sub_epi16( m2[4], m2[6] );
2159
0
    m1[7] = _mm256_sub_epi16( m2[5], m2[7] ); // 13 bit
2160
2161
0
    m2[0] = _mm256_add_epi16( m1[0], m1[1] );
2162
0
    m2[1] = _mm256_sub_epi16( m1[0], m1[1] );
2163
0
    m2[2] = _mm256_add_epi16( m1[2], m1[3] );
2164
0
    m2[3] = _mm256_sub_epi16( m1[2], m1[3] );
2165
0
    m2[4] = _mm256_add_epi16( m1[4], m1[5] );
2166
0
    m2[5] = _mm256_sub_epi16( m1[4], m1[5] );
2167
0
    m2[6] = _mm256_add_epi16( m1[6], m1[7] );
2168
0
    m2[7] = _mm256_sub_epi16( m1[6], m1[7] ); // 14 bit
2169
2170
0
    m1[0] = _mm256_unpacklo_epi16( m2[0], m2[1] );
2171
0
    m1[1] = _mm256_unpacklo_epi16( m2[2], m2[3] );
2172
0
    m1[2] = _mm256_unpacklo_epi16( m2[4], m2[5] );
2173
0
    m1[3] = _mm256_unpacklo_epi16( m2[6], m2[7] );
2174
0
    m1[4] = _mm256_unpackhi_epi16( m2[0], m2[1] );
2175
0
    m1[5] = _mm256_unpackhi_epi16( m2[2], m2[3] );
2176
0
    m1[6] = _mm256_unpackhi_epi16( m2[4], m2[5] );
2177
0
    m1[7] = _mm256_unpackhi_epi16( m2[6], m2[7] );
2178
2179
0
    m2[0] = _mm256_unpacklo_epi32( m1[0], m1[1] );
2180
0
    m2[1] = _mm256_unpackhi_epi32( m1[0], m1[1] );
2181
0
    m2[2] = _mm256_unpacklo_epi32( m1[2], m1[3] );
2182
0
    m2[3] = _mm256_unpackhi_epi32( m1[2], m1[3] );
2183
0
    m2[4] = _mm256_unpacklo_epi32( m1[4], m1[5] );
2184
0
    m2[5] = _mm256_unpackhi_epi32( m1[4], m1[5] );
2185
0
    m2[6] = _mm256_unpacklo_epi32( m1[6], m1[7] );
2186
0
    m2[7] = _mm256_unpackhi_epi32( m1[6], m1[7] );
2187
2188
0
    m1[0] = _mm256_unpacklo_epi64( m2[0], m2[2] );
2189
0
    m1[1] = _mm256_unpackhi_epi64( m2[0], m2[2] );
2190
0
    m1[2] = _mm256_unpacklo_epi64( m2[1], m2[3] );
2191
0
    m1[3] = _mm256_unpackhi_epi64( m2[1], m2[3] );
2192
0
    m1[4] = _mm256_unpacklo_epi64( m2[4], m2[6] );
2193
0
    m1[5] = _mm256_unpackhi_epi64( m2[4], m2[6] );
2194
0
    m1[6] = _mm256_unpacklo_epi64( m2[5], m2[7] );
2195
0
    m1[7] = _mm256_unpackhi_epi64( m2[5], m2[7] );
2196
    
2197
0
    for( int k = 0; k < 8; k++ )
2198
0
    {
2199
0
      m1[k+8] = _mm256_cvtepi16_epi32( _mm256_extracti128_si256( m1[k], 1 ) );
2200
0
      m1[k]   = _mm256_cvtepi16_epi32( _mm256_castsi256_si128  ( m1[k]    ) );
2201
0
    }
2202
0
#endif
2203
2204
    // horizontal
2205
0
    {
2206
0
      m2[ 0] = _mm256_add_epi32( m1[0], m1[ 8] );
2207
0
      m2[ 1] = _mm256_add_epi32( m1[1], m1[ 9] );
2208
0
      m2[ 2] = _mm256_add_epi32( m1[2], m1[10] );
2209
0
      m2[ 3] = _mm256_add_epi32( m1[3], m1[11] );
2210
0
      m2[ 4] = _mm256_add_epi32( m1[4], m1[12] );
2211
0
      m2[ 5] = _mm256_add_epi32( m1[5], m1[13] );
2212
0
      m2[ 6] = _mm256_add_epi32( m1[6], m1[14] );
2213
0
      m2[ 7] = _mm256_add_epi32( m1[7], m1[15] );
2214
0
      m2[ 8] = _mm256_sub_epi32( m1[0], m1[ 8] );
2215
0
      m2[ 9] = _mm256_sub_epi32( m1[1], m1[ 9] );
2216
0
      m2[10] = _mm256_sub_epi32( m1[2], m1[10] );
2217
0
      m2[11] = _mm256_sub_epi32( m1[3], m1[11] );
2218
0
      m2[12] = _mm256_sub_epi32( m1[4], m1[12] );
2219
0
      m2[13] = _mm256_sub_epi32( m1[5], m1[13] );
2220
0
      m2[14] = _mm256_sub_epi32( m1[6], m1[14] );
2221
0
      m2[15] = _mm256_sub_epi32( m1[7], m1[15] );
2222
2223
0
      m1[ 0] = _mm256_add_epi32( m2[ 0], m2[ 4] );
2224
0
      m1[ 1] = _mm256_add_epi32( m2[ 1], m2[ 5] );
2225
0
      m1[ 2] = _mm256_add_epi32( m2[ 2], m2[ 6] );
2226
0
      m1[ 3] = _mm256_add_epi32( m2[ 3], m2[ 7] );
2227
0
      m1[ 4] = _mm256_sub_epi32( m2[ 0], m2[ 4] );
2228
0
      m1[ 5] = _mm256_sub_epi32( m2[ 1], m2[ 5] );
2229
0
      m1[ 6] = _mm256_sub_epi32( m2[ 2], m2[ 6] );
2230
0
      m1[ 7] = _mm256_sub_epi32( m2[ 3], m2[ 7] );
2231
0
      m1[ 8] = _mm256_add_epi32( m2[ 8], m2[12] );
2232
0
      m1[ 9] = _mm256_add_epi32( m2[ 9], m2[13] );
2233
0
      m1[10] = _mm256_add_epi32( m2[10], m2[14] );
2234
0
      m1[11] = _mm256_add_epi32( m2[11], m2[15] );
2235
0
      m1[12] = _mm256_sub_epi32( m2[ 8], m2[12] );
2236
0
      m1[13] = _mm256_sub_epi32( m2[ 9], m2[13] );
2237
0
      m1[14] = _mm256_sub_epi32( m2[10], m2[14] );
2238
0
      m1[15] = _mm256_sub_epi32( m2[11], m2[15] );
2239
2240
0
      m2[ 0] = _mm256_add_epi32( m1[ 0], m1[ 2] );
2241
0
      m2[ 1] = _mm256_add_epi32( m1[ 1], m1[ 3] );
2242
0
      m2[ 2] = _mm256_sub_epi32( m1[ 0], m1[ 2] );
2243
0
      m2[ 3] = _mm256_sub_epi32( m1[ 1], m1[ 3] );
2244
0
      m2[ 4] = _mm256_add_epi32( m1[ 4], m1[ 6] );
2245
0
      m2[ 5] = _mm256_add_epi32( m1[ 5], m1[ 7] );
2246
0
      m2[ 6] = _mm256_sub_epi32( m1[ 4], m1[ 6] );
2247
0
      m2[ 7] = _mm256_sub_epi32( m1[ 5], m1[ 7] );
2248
0
      m2[ 8] = _mm256_add_epi32( m1[ 8], m1[10] );
2249
0
      m2[ 9] = _mm256_add_epi32( m1[ 9], m1[11] );
2250
0
      m2[10] = _mm256_sub_epi32( m1[ 8], m1[10] );
2251
0
      m2[11] = _mm256_sub_epi32( m1[ 9], m1[11] );
2252
0
      m2[12] = _mm256_add_epi32( m1[12], m1[14] );
2253
0
      m2[13] = _mm256_add_epi32( m1[13], m1[15] );
2254
0
      m2[14] = _mm256_sub_epi32( m1[12], m1[14] );
2255
0
      m2[15] = _mm256_sub_epi32( m1[13], m1[15] );
2256
2257
0
      m1[ 0] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 0], m2[ 1] ) );
2258
0
      m1[ 1] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 0], m2[ 1] ) );
2259
0
      m1[ 2] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 2], m2[ 3] ) );
2260
0
      m1[ 3] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 2], m2[ 3] ) );
2261
0
      m1[ 4] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 4], m2[ 5] ) );
2262
0
      m1[ 5] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 4], m2[ 5] ) );
2263
0
      m1[ 6] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 6], m2[ 7] ) );
2264
0
      m1[ 7] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 6], m2[ 7] ) );
2265
0
      m1[ 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 8], m2[ 9] ) );
2266
0
      m1[ 9] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 8], m2[ 9] ) );
2267
0
      m1[10] = _mm256_abs_epi32( _mm256_add_epi32( m2[10], m2[11] ) );
2268
0
      m1[11] = _mm256_abs_epi32( _mm256_sub_epi32( m2[10], m2[11] ) );
2269
0
      m1[12] = _mm256_abs_epi32( _mm256_add_epi32( m2[12], m2[13] ) );
2270
0
      m1[13] = _mm256_abs_epi32( _mm256_sub_epi32( m2[12], m2[13] ) );
2271
0
      m1[14] = _mm256_abs_epi32( _mm256_add_epi32( m2[14], m2[15] ) );
2272
0
      m1[15] = _mm256_abs_epi32( _mm256_sub_epi32( m2[14], m2[15] ) );
2273
0
    }
2274
2275
0
    uint32_t absDc = _mm_cvtsi128_si32( _mm256_castsi256_si128( m1[0] ) );
2276
2277
    // sum up
2278
0
    m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 1] );
2279
0
    m1[ 2] = _mm256_add_epi32( m1[ 2], m1[ 3] );
2280
0
    m1[ 4] = _mm256_add_epi32( m1[ 4], m1[ 5] );
2281
0
    m1[ 6] = _mm256_add_epi32( m1[ 6], m1[ 7] );
2282
0
    m1[ 8] = _mm256_add_epi32( m1[ 8], m1[ 9] );
2283
0
    m1[10] = _mm256_add_epi32( m1[10], m1[11] );
2284
0
    m1[12] = _mm256_add_epi32( m1[12], m1[13] );
2285
0
    m1[14] = _mm256_add_epi32( m1[14], m1[15] );
2286
2287
0
    m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 2] );
2288
0
    m1[ 4] = _mm256_add_epi32( m1[ 4], m1[ 6] );
2289
0
    m1[ 8] = _mm256_add_epi32( m1[ 8], m1[10] );
2290
0
    m1[12] = _mm256_add_epi32( m1[12], m1[14] );
2291
2292
0
    m1[0] = _mm256_add_epi32( m1[0], m1[ 4] );
2293
0
    m1[8] = _mm256_add_epi32( m1[8], m1[12] );
2294
2295
0
    __m256i iSum = _mm256_add_epi32( m1[0], m1[8] );
2296
0
    iSum = _mm256_hadd_epi32( iSum, iSum );
2297
0
    iSum = _mm256_hadd_epi32( iSum, iSum );
2298
0
    iSum = _mm256_add_epi32( iSum, _mm256_permute2x128_si256( iSum, iSum, 0x11 ) );
2299
2300
0
    sad = _mm_cvtsi128_si32( _mm256_castsi256_si128( iSum ) );
2301
    
2302
0
    sad -= absDc;
2303
0
    sad += absDc >> 2;
2304
0
    sad = (uint32_t)(sad / sqrt(16.0 * 8) * 2);
2305
0
  }
2306
2307
0
#endif //USE_AVX2
2308
2309
0
  return (sad);
2310
0
}
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD16x8_AVX2(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD16x8_AVX2(short const*, short const*, int, int, int)
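
Editorial note: the three AVX2 kernels above assert iBitDepth <= 10 because their vertical pass stays in 16-bit lanes; the trailing '// 11 bit' ... '// 14 bit' comments track the growth. A small sketch of the bound (an assumption spelled out for clarity, not code from the file; vertPassBitsAVX2 is a hypothetical helper):

// With at most 10-bit samples the residual needs 11 bits and each of the three
// vertical butterfly stages can add one more, so intermediates peak at 14 bits
// and still fit a signed 16-bit lane before the widening to 32-bit coefficients.
static inline int vertPassBitsAVX2( int iBitDepth ) { return ( iBitDepth + 1 ) + 3; }
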
2311
2312
static uint32_t xCalcHAD8x16_AVX2( const Pel* piOrg, const Pel* piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth )
2313
0
{
2314
0
  uint32_t sad = 0;
2315
2316
0
#ifdef USE_AVX2
2317
0
  __m256i m1[16], m2[16];
2318
2319
0
  {
2320
0
    {
2321
0
      for( int k = 0; k < 16; k++ )
2322
0
      {
2323
0
        __m256i r0 = _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*)piOrg ) );
2324
0
        __m256i r1 = _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*)piCur ) );
2325
0
        m1[k] = _mm256_sub_epi32( r0, r1 );
2326
0
        piCur += iStrideCur;
2327
0
        piOrg += iStrideOrg;
2328
0
      }
2329
0
    }
2330
2331
    // vertical
2332
2333
0
    m2[ 0] = _mm256_add_epi32( m1[0], m1[ 8] );
2334
0
    m2[ 1] = _mm256_add_epi32( m1[1], m1[ 9] );
2335
0
    m2[ 2] = _mm256_add_epi32( m1[2], m1[10] );
2336
0
    m2[ 3] = _mm256_add_epi32( m1[3], m1[11] );
2337
0
    m2[ 4] = _mm256_add_epi32( m1[4], m1[12] );
2338
0
    m2[ 5] = _mm256_add_epi32( m1[5], m1[13] );
2339
0
    m2[ 6] = _mm256_add_epi32( m1[6], m1[14] );
2340
0
    m2[ 7] = _mm256_add_epi32( m1[7], m1[15] );
2341
0
    m2[ 8] = _mm256_sub_epi32( m1[0], m1[ 8] );
2342
0
    m2[ 9] = _mm256_sub_epi32( m1[1], m1[ 9] );
2343
0
    m2[10] = _mm256_sub_epi32( m1[2], m1[10] );
2344
0
    m2[11] = _mm256_sub_epi32( m1[3], m1[11] );
2345
0
    m2[12] = _mm256_sub_epi32( m1[4], m1[12] );
2346
0
    m2[13] = _mm256_sub_epi32( m1[5], m1[13] );
2347
0
    m2[14] = _mm256_sub_epi32( m1[6], m1[14] );
2348
0
    m2[15] = _mm256_sub_epi32( m1[7], m1[15] );
2349
2350
0
    m1[ 0] = _mm256_add_epi32( m2[ 0], m2[ 4] );
2351
0
    m1[ 1] = _mm256_add_epi32( m2[ 1], m2[ 5] );
2352
0
    m1[ 2] = _mm256_add_epi32( m2[ 2], m2[ 6] );
2353
0
    m1[ 3] = _mm256_add_epi32( m2[ 3], m2[ 7] );
2354
0
    m1[ 4] = _mm256_sub_epi32( m2[ 0], m2[ 4] );
2355
0
    m1[ 5] = _mm256_sub_epi32( m2[ 1], m2[ 5] );
2356
0
    m1[ 6] = _mm256_sub_epi32( m2[ 2], m2[ 6] );
2357
0
    m1[ 7] = _mm256_sub_epi32( m2[ 3], m2[ 7] );
2358
0
    m1[ 8] = _mm256_add_epi32( m2[ 8], m2[12] );
2359
0
    m1[ 9] = _mm256_add_epi32( m2[ 9], m2[13] );
2360
0
    m1[10] = _mm256_add_epi32( m2[10], m2[14] );
2361
0
    m1[11] = _mm256_add_epi32( m2[11], m2[15] );
2362
0
    m1[12] = _mm256_sub_epi32( m2[ 8], m2[12] );
2363
0
    m1[13] = _mm256_sub_epi32( m2[ 9], m2[13] );
2364
0
    m1[14] = _mm256_sub_epi32( m2[10], m2[14] );
2365
0
    m1[15] = _mm256_sub_epi32( m2[11], m2[15] );
2366
2367
0
    m2[ 0] = _mm256_add_epi32( m1[ 0], m1[ 2] );
2368
0
    m2[ 1] = _mm256_add_epi32( m1[ 1], m1[ 3] );
2369
0
    m2[ 2] = _mm256_sub_epi32( m1[ 0], m1[ 2] );
2370
0
    m2[ 3] = _mm256_sub_epi32( m1[ 1], m1[ 3] );
2371
0
    m2[ 4] = _mm256_add_epi32( m1[ 4], m1[ 6] );
2372
0
    m2[ 5] = _mm256_add_epi32( m1[ 5], m1[ 7] );
2373
0
    m2[ 6] = _mm256_sub_epi32( m1[ 4], m1[ 6] );
2374
0
    m2[ 7] = _mm256_sub_epi32( m1[ 5], m1[ 7] );
2375
0
    m2[ 8] = _mm256_add_epi32( m1[ 8], m1[10] );
2376
0
    m2[ 9] = _mm256_add_epi32( m1[ 9], m1[11] );
2377
0
    m2[10] = _mm256_sub_epi32( m1[ 8], m1[10] );
2378
0
    m2[11] = _mm256_sub_epi32( m1[ 9], m1[11] );
2379
0
    m2[12] = _mm256_add_epi32( m1[12], m1[14] );
2380
0
    m2[13] = _mm256_add_epi32( m1[13], m1[15] );
2381
0
    m2[14] = _mm256_sub_epi32( m1[12], m1[14] );
2382
0
    m2[15] = _mm256_sub_epi32( m1[13], m1[15] );
2383
2384
0
    m1[ 0] = _mm256_add_epi32( m2[ 0], m2[ 1] );
2385
0
    m1[ 1] = _mm256_sub_epi32( m2[ 0], m2[ 1] );
2386
0
    m1[ 2] = _mm256_add_epi32( m2[ 2], m2[ 3] );
2387
0
    m1[ 3] = _mm256_sub_epi32( m2[ 2], m2[ 3] );
2388
0
    m1[ 4] = _mm256_add_epi32( m2[ 4], m2[ 5] );
2389
0
    m1[ 5] = _mm256_sub_epi32( m2[ 4], m2[ 5] );
2390
0
    m1[ 6] = _mm256_add_epi32( m2[ 6], m2[ 7] );
2391
0
    m1[ 7] = _mm256_sub_epi32( m2[ 6], m2[ 7] );
2392
0
    m1[ 8] = _mm256_add_epi32( m2[ 8], m2[ 9] );
2393
0
    m1[ 9] = _mm256_sub_epi32( m2[ 8], m2[ 9] );
2394
0
    m1[10] = _mm256_add_epi32( m2[10], m2[11] );
2395
0
    m1[11] = _mm256_sub_epi32( m2[10], m2[11] );
2396
0
    m1[12] = _mm256_add_epi32( m2[12], m2[13] );
2397
0
    m1[13] = _mm256_sub_epi32( m2[12], m2[13] );
2398
0
    m1[14] = _mm256_add_epi32( m2[14], m2[15] );
2399
0
    m1[15] = _mm256_sub_epi32( m2[14], m2[15] );
2400
2401
    // transpose
2402
0
    #define perm_unpacklo_epi128 ( ( 0 << 0 ) + ( 2 << 4 ) )
2403
0
    #define perm_unpackhi_epi128 ( ( 1 << 0 ) + ( 3 << 4 ) )
2404
2405
    // first 8x8 sub-block
2406
0
    m2[0] = _mm256_unpacklo_epi32( m1[0], m1[1] );
2407
0
    m2[1] = _mm256_unpacklo_epi32( m1[2], m1[3] );
2408
0
    m2[2] = _mm256_unpacklo_epi32( m1[4], m1[5] );
2409
0
    m2[3] = _mm256_unpacklo_epi32( m1[6], m1[7] );
2410
0
    m2[4] = _mm256_unpackhi_epi32( m1[0], m1[1] );
2411
0
    m2[5] = _mm256_unpackhi_epi32( m1[2], m1[3] );
2412
0
    m2[6] = _mm256_unpackhi_epi32( m1[4], m1[5] );
2413
0
    m2[7] = _mm256_unpackhi_epi32( m1[6], m1[7] );
2414
2415
0
    m1[0] = _mm256_unpacklo_epi64( m2[0], m2[1] );
2416
0
    m1[1] = _mm256_unpackhi_epi64( m2[0], m2[1] );
2417
0
    m1[2] = _mm256_unpacklo_epi64( m2[2], m2[3] );
2418
0
    m1[3] = _mm256_unpackhi_epi64( m2[2], m2[3] );
2419
0
    m1[4] = _mm256_unpacklo_epi64( m2[4], m2[5] );
2420
0
    m1[5] = _mm256_unpackhi_epi64( m2[4], m2[5] );
2421
0
    m1[6] = _mm256_unpacklo_epi64( m2[6], m2[7] );
2422
0
    m1[7] = _mm256_unpackhi_epi64( m2[6], m2[7] );
2423
2424
0
    m2[0] = _mm256_permute2x128_si256( m1[0], m1[2], perm_unpacklo_epi128 );
2425
0
    m2[1] = _mm256_permute2x128_si256( m1[0], m1[2], perm_unpackhi_epi128 );
2426
0
    m2[2] = _mm256_permute2x128_si256( m1[1], m1[3], perm_unpacklo_epi128 );
2427
0
    m2[3] = _mm256_permute2x128_si256( m1[1], m1[3], perm_unpackhi_epi128 );
2428
0
    m2[4] = _mm256_permute2x128_si256( m1[4], m1[6], perm_unpacklo_epi128 );
2429
0
    m2[5] = _mm256_permute2x128_si256( m1[4], m1[6], perm_unpackhi_epi128 );
2430
0
    m2[6] = _mm256_permute2x128_si256( m1[5], m1[7], perm_unpacklo_epi128 );
2431
0
    m2[7] = _mm256_permute2x128_si256( m1[5], m1[7], perm_unpackhi_epi128 );
2432
2433
    // second 8x8 sub-block
2434
0
    m2[0+8] = _mm256_unpacklo_epi32( m1[0+8], m1[1+8] );
2435
0
    m2[1+8] = _mm256_unpacklo_epi32( m1[2+8], m1[3+8] );
2436
0
    m2[2+8] = _mm256_unpacklo_epi32( m1[4+8], m1[5+8] );
2437
0
    m2[3+8] = _mm256_unpacklo_epi32( m1[6+8], m1[7+8] );
2438
0
    m2[4+8] = _mm256_unpackhi_epi32( m1[0+8], m1[1+8] );
2439
0
    m2[5+8] = _mm256_unpackhi_epi32( m1[2+8], m1[3+8] );
2440
0
    m2[6+8] = _mm256_unpackhi_epi32( m1[4+8], m1[5+8] );
2441
0
    m2[7+8] = _mm256_unpackhi_epi32( m1[6+8], m1[7+8] );
2442
2443
0
    m1[0+8] = _mm256_unpacklo_epi64( m2[0+8], m2[1+8] );
2444
0
    m1[1+8] = _mm256_unpackhi_epi64( m2[0+8], m2[1+8] );
2445
0
    m1[2+8] = _mm256_unpacklo_epi64( m2[2+8], m2[3+8] );
2446
0
    m1[3+8] = _mm256_unpackhi_epi64( m2[2+8], m2[3+8] );
2447
0
    m1[4+8] = _mm256_unpacklo_epi64( m2[4+8], m2[5+8] );
2448
0
    m1[5+8] = _mm256_unpackhi_epi64( m2[4+8], m2[5+8] );
2449
0
    m1[6+8] = _mm256_unpacklo_epi64( m2[6+8], m2[7+8] );
2450
0
    m1[7+8] = _mm256_unpackhi_epi64( m2[6+8], m2[7+8] );
2451
2452
0
    m2[0+8] = _mm256_permute2x128_si256( m1[0+8], m1[2+8], perm_unpacklo_epi128 );
2453
0
    m2[1+8] = _mm256_permute2x128_si256( m1[0+8], m1[2+8], perm_unpackhi_epi128 );
2454
0
    m2[2+8] = _mm256_permute2x128_si256( m1[1+8], m1[3+8], perm_unpacklo_epi128 );
2455
0
    m2[3+8] = _mm256_permute2x128_si256( m1[1+8], m1[3+8], perm_unpackhi_epi128 );
2456
0
    m2[4+8] = _mm256_permute2x128_si256( m1[4+8], m1[6+8], perm_unpacklo_epi128 );
2457
0
    m2[5+8] = _mm256_permute2x128_si256( m1[4+8], m1[6+8], perm_unpackhi_epi128 );
2458
0
    m2[6+8] = _mm256_permute2x128_si256( m1[5+8], m1[7+8], perm_unpacklo_epi128 );
2459
0
    m2[7+8] = _mm256_permute2x128_si256( m1[5+8], m1[7+8], perm_unpackhi_epi128 );
2460
    
2461
0
    #undef perm_unpacklo_epi128    
2462
0
    #undef perm_unpackhi_epi128
2463
2464
    // horizontal
2465
0
    m1[0] = _mm256_add_epi32( m2[0], m2[4] );
2466
0
    m1[1] = _mm256_add_epi32( m2[1], m2[5] );
2467
0
    m1[2] = _mm256_add_epi32( m2[2], m2[6] );
2468
0
    m1[3] = _mm256_add_epi32( m2[3], m2[7] );
2469
0
    m1[4] = _mm256_sub_epi32( m2[0], m2[4] );
2470
0
    m1[5] = _mm256_sub_epi32( m2[1], m2[5] );
2471
0
    m1[6] = _mm256_sub_epi32( m2[2], m2[6] );
2472
0
    m1[7] = _mm256_sub_epi32( m2[3], m2[7] );
2473
2474
0
    m2[0] = _mm256_add_epi32( m1[0], m1[2] );
2475
0
    m2[1] = _mm256_add_epi32( m1[1], m1[3] );
2476
0
    m2[2] = _mm256_sub_epi32( m1[0], m1[2] );
2477
0
    m2[3] = _mm256_sub_epi32( m1[1], m1[3] );
2478
0
    m2[4] = _mm256_add_epi32( m1[4], m1[6] );
2479
0
    m2[5] = _mm256_add_epi32( m1[5], m1[7] );
2480
0
    m2[6] = _mm256_sub_epi32( m1[4], m1[6] );
2481
0
    m2[7] = _mm256_sub_epi32( m1[5], m1[7] );
2482
2483
0
    m1[0] = _mm256_abs_epi32( _mm256_add_epi32( m2[0], m2[1] ) );
2484
0
    m1[1] = _mm256_abs_epi32( _mm256_sub_epi32( m2[0], m2[1] ) );
2485
0
    m1[2] = _mm256_abs_epi32( _mm256_add_epi32( m2[2], m2[3] ) );
2486
0
    m1[3] = _mm256_abs_epi32( _mm256_sub_epi32( m2[2], m2[3] ) );
2487
0
    m1[4] = _mm256_abs_epi32( _mm256_add_epi32( m2[4], m2[5] ) );
2488
0
    m1[5] = _mm256_abs_epi32( _mm256_sub_epi32( m2[4], m2[5] ) );
2489
0
    m1[6] = _mm256_abs_epi32( _mm256_add_epi32( m2[6], m2[7] ) );
2490
0
    m1[7] = _mm256_abs_epi32( _mm256_sub_epi32( m2[6], m2[7] ) );
2491
2492
0
    int absDc = _mm_cvtsi128_si32( _mm256_castsi256_si128( m1[0] ) );
2493
2494
0
    m1[0 + 8] = _mm256_add_epi32( m2[0 + 8], m2[4 + 8] );
2495
0
    m1[1 + 8] = _mm256_add_epi32( m2[1 + 8], m2[5 + 8] );
2496
0
    m1[2 + 8] = _mm256_add_epi32( m2[2 + 8], m2[6 + 8] );
2497
0
    m1[3 + 8] = _mm256_add_epi32( m2[3 + 8], m2[7 + 8] );
2498
0
    m1[4 + 8] = _mm256_sub_epi32( m2[0 + 8], m2[4 + 8] );
2499
0
    m1[5 + 8] = _mm256_sub_epi32( m2[1 + 8], m2[5 + 8] );
2500
0
    m1[6 + 8] = _mm256_sub_epi32( m2[2 + 8], m2[6 + 8] );
2501
0
    m1[7 + 8] = _mm256_sub_epi32( m2[3 + 8], m2[7 + 8] );
2502
2503
0
    m2[0 + 8] = _mm256_add_epi32( m1[0 + 8], m1[2 + 8] );
2504
0
    m2[1 + 8] = _mm256_add_epi32( m1[1 + 8], m1[3 + 8] );
2505
0
    m2[2 + 8] = _mm256_sub_epi32( m1[0 + 8], m1[2 + 8] );
2506
0
    m2[3 + 8] = _mm256_sub_epi32( m1[1 + 8], m1[3 + 8] );
2507
0
    m2[4 + 8] = _mm256_add_epi32( m1[4 + 8], m1[6 + 8] );
2508
0
    m2[5 + 8] = _mm256_add_epi32( m1[5 + 8], m1[7 + 8] );
2509
0
    m2[6 + 8] = _mm256_sub_epi32( m1[4 + 8], m1[6 + 8] );
2510
0
    m2[7 + 8] = _mm256_sub_epi32( m1[5 + 8], m1[7 + 8] );
2511
2512
0
    m1[0 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[0 + 8], m2[1 + 8] ) );
2513
0
    m1[1 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[0 + 8], m2[1 + 8] ) );
2514
0
    m1[2 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[2 + 8], m2[3 + 8] ) );
2515
0
    m1[3 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[2 + 8], m2[3 + 8] ) );
2516
0
    m1[4 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[4 + 8], m2[5 + 8] ) );
2517
0
    m1[5 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[4 + 8], m2[5 + 8] ) );
2518
0
    m1[6 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[6 + 8], m2[7 + 8] ) );
2519
0
    m1[7 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[6 + 8], m2[7 + 8] ) );
2520
2521
    // sum up
2522
0
    m1[0] = _mm256_add_epi32( m1[0], m1[1] );
2523
0
    m1[1] = _mm256_add_epi32( m1[2], m1[3] );
2524
0
    m1[2] = _mm256_add_epi32( m1[4], m1[5] );
2525
0
    m1[3] = _mm256_add_epi32( m1[6], m1[7] );
2526
0
    m1[4] = _mm256_add_epi32( m1[8], m1[9] );
2527
0
    m1[5] = _mm256_add_epi32( m1[10], m1[11] );
2528
0
    m1[6] = _mm256_add_epi32( m1[12], m1[13] );
2529
0
    m1[7] = _mm256_add_epi32( m1[14], m1[15] );
2530
2531
    // sum up
2532
0
    m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 1] );
2533
0
    m1[ 1] = _mm256_add_epi32( m1[ 2], m1[ 3] );
2534
0
    m1[ 2] = _mm256_add_epi32( m1[ 4], m1[ 5] );
2535
0
    m1[ 3] = _mm256_add_epi32( m1[ 6], m1[ 7] );
2536
2537
0
    m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 1] );
2538
0
    m1[ 1] = _mm256_add_epi32( m1[ 2], m1[ 3] );
2539
2540
0
    __m256i iSum = _mm256_add_epi32( m1[0], m1[1] );
2541
2542
0
    iSum = _mm256_hadd_epi32( iSum, iSum );
2543
0
    iSum = _mm256_hadd_epi32( iSum, iSum );
2544
0
    iSum = _mm256_add_epi32( iSum, _mm256_permute2x128_si256( iSum, iSum, 0x11 ) );
2545
2546
0
    int sad2 = _mm_cvtsi128_si32( _mm256_castsi256_si128( iSum ) );
2547
0
    sad2 -= absDc;
2548
0
    sad2 += absDc >> 2;
2549
0
    sad = (uint32_t)(sad2 / sqrt(16.0 * 8) * 2);
2550
0
  }
2551
2552
0
#endif //USE_AVX2
2553
2554
0
  return (sad);
2555
0
}
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD8x16_AVX2(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD8x16_AVX2(short const*, short const*, int, int, int)
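xCalcHAD8x16_AVX2 above mirrors the 16x8 kernel: a 16-point vertical transform, a 32-bit transpose, then an 8-point horizontal transform, followed by the same DC re-weighting and a 2/sqrt(8*16) scale. Each add/sub pair in the body is one butterfly of the 1-D Hadamard transform; as a scalar sketch of a single stage (illustrative helper, not taken from the source):

static inline void hadamardButterflyStage( const int* in, int* out, int half )
{
  // pair element i with element i+half: the sum goes to the lower half, the difference to the upper half
  for( int i = 0; i < half; i++ )
  {
    out[i]        = in[i] + in[i + half];
    out[i + half] = in[i] - in[i + half];
  }
}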
2556
2557
template<X86_VEXT vext >
2558
Distortion RdCost::xGetHAD2SADs_SIMD( const DistParam &rcDtParam )
2559
0
{
2560
0
  Distortion distHad = xGetHADs_SIMD<vext, false>( rcDtParam );
2561
0
  Distortion distSad = 0;
2562
2563
0
  {
2564
0
    const short* pSrc1   = (const short*)rcDtParam.org.buf;
2565
0
    const short* pSrc2   = (const short*)rcDtParam.cur.buf;
2566
0
    const int iStrideSrc1 = rcDtParam.org.stride<<2;
2567
0
    const int iStrideSrc2 = rcDtParam.cur.stride<<2;
2568
0
    const int  iRows      = rcDtParam.org.height>>2;
2569
0
    const int  iCols      = rcDtParam.org.width<<2;
2570
2571
0
    uint32_t uiSum = 0;
2572
0
    CHECKD( (rcDtParam.org.width != rcDtParam.org.stride) || (rcDtParam.cur.stride != rcDtParam.org.stride) , "this function assumes compact, aligned buffering");
2573
2574
#ifdef USE_AVX2 
2575
0
    if( vext >= AVX2 )
2576
0
    {
2577
      // Process widths that are a multiple of 16
2578
0
      __m256i vone   = _mm256_set1_epi16( 1 );
2579
0
      __m256i vsum32 = _mm256_setzero_si256();
2580
0
      for( int iY = 0; iY < iRows; iY++ )
2581
0
      {
2582
0
        __m256i vsum16 = _mm256_setzero_si256();
2583
0
        for( int iX = 0; iX < iCols; iX+=16 )
2584
0
        {
2585
0
          __m256i vsrc1 = _mm256_load_si256( ( __m256i* )( &pSrc1[iX] ) );
2586
0
          __m256i vsrc2 = _mm256_load_si256( ( __m256i* )( &pSrc2[iX] ) );
2587
0
          vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) );
2588
0
        }
2589
0
        __m256i vsumtemp = _mm256_madd_epi16( vsum16, vone );
2590
0
        vsum32 = _mm256_add_epi32( vsum32, vsumtemp );
2591
0
        pSrc1   += iStrideSrc1;
2592
0
        pSrc2   += iStrideSrc2;
2593
0
      }
2594
0
      vsum32 = _mm256_hadd_epi32( vsum32, vone );
2595
0
      vsum32 = _mm256_hadd_epi32( vsum32, vone );
2596
0
      uiSum =  _mm_cvtsi128_si32( _mm256_castsi256_si128( vsum32 ) ) + _mm_cvtsi128_si32( _mm256_extracti128_si256( vsum32, 1 ) );
2597
0
    }
2598
0
    else
2599
0
#endif
2600
0
    {
2601
      // Process widths that are a multiple of 8
2602
0
      __m128i vone = _mm_set1_epi16( 1 );
2603
0
      __m128i vsum32 = _mm_setzero_si128();
2604
0
      for( int iY = 0; iY < iRows; iY++ )
2605
0
      {
2606
0
        __m128i vsum16 = _mm_setzero_si128();
2607
0
        for( int iX = 0; iX < iCols; iX+=8 )
2608
0
        {
2609
0
          __m128i vsrc1 = _mm_load_si128( ( const __m128i* )( &pSrc1[iX] ) );
2610
0
          __m128i vsrc2 = _mm_load_si128( ( const __m128i* )( &pSrc2[iX] ) );
2611
0
          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
2612
0
        }
2613
0
        __m128i vsumtemp = _mm_madd_epi16( vsum16, vone );
2614
0
        vsum32 = _mm_add_epi32( vsum32, vsumtemp );
2615
0
        pSrc1   += iStrideSrc1;
2616
0
        pSrc2   += iStrideSrc2;
2617
0
      }
2618
0
      vsum32 = _mm_hadd_epi32( vsum32, vone );
2619
0
      vsum32 = _mm_hadd_epi32( vsum32, vone );
2620
0
      uiSum =  _mm_cvtsi128_si32( vsum32 );
2621
0
    }
2622
0
    distSad = uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
2623
0
  }
2624
2625
0
  return std::min( distHad, 2*distSad);
2626
0
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHAD2SADs_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHAD2SADs_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
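The SAD half of xGetHAD2SADs_SIMD above visits every sample of the block; the <<2/>>2 adjustments of rows, columns and strides merely regroup four picture rows into one loop row, which is only valid for the compact buffering asserted by the CHECKD. A scalar restatement under that assumption (width == stride):

Distortion sad = 0;
for( int y = 0; y < height; y++ )
  for( int x = 0; x < width; x++ )
    sad += abs( org[y * width + x] - cur[y * width + x] );
sad >>= DISTORTION_PRECISION_ADJUSTMENT( bitDepth );
return std::min( distHad, 2 * sad );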
2627
2628
template<X86_VEXT vext> 
2629
Distortion RdCost::xGetSADwMask_SIMD(const DistParam &rcDtParam)
2630
0
{
2631
0
  if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight)
2632
0
    return RdCost::xGetSADwMask(rcDtParam);
2633
2634
0
  const short *src1       = (const short *) rcDtParam.org.buf;
2635
0
  const short *src2       = (const short *) rcDtParam.cur.buf;
2636
0
  const short *weightMask = (const short *) rcDtParam.mask;
2637
0
  int          rows       = rcDtParam.org.height;
2638
0
  int          cols       = rcDtParam.org.width;
2639
0
  int          subShift   = rcDtParam.subShift;
2640
0
  int          subStep    = (1 << subShift);
2641
0
  const int    strideSrc1 = rcDtParam.org.stride * subStep;
2642
0
  const int    strideSrc2 = rcDtParam.cur.stride * subStep;
2643
0
  const int    strideMask = rcDtParam.maskStride * subStep;
2644
2645
0
  Distortion sum = 0;
2646
0
  if (vext >= AVX2 && (cols & 15) == 0)
2647
0
  {
2648
#if defined( USE_AVX2 )
2649
    // Process widths that are a multiple of 16
2650
    __m256i vzero  = _mm256_setzero_si256();
2651
    __m256i vsum32 = vzero;
2652
0
    for (int y = 0; y < rows; y += subStep)
2653
0
    {
2654
0
      for (int x = 0; x < cols; x += 16)
2655
0
      {
2656
0
        __m256i vsrc1 = _mm256_loadu_si256((__m256i *) (&src1[x]));
2657
0
        __m256i vsrc2 = _mm256_loadu_si256((__m256i *) (&src2[x]));
2658
0
        __m256i vmask;
2659
2660
0
        if (rcDtParam.stepX == -1)
2661
0
        {
2662
0
          vmask                      = _mm256_loadu_si256((__m256i *) ((&weightMask[x]) - (x << 1) - (16 - 1)));
2663
0
          const __m256i shuffle_mask = _mm256_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2,
2664
0
                                                       5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
2665
0
          vmask                      = _mm256_shuffle_epi8(vmask, shuffle_mask);
2666
0
          vmask                      = _mm256_permute4x64_epi64(vmask, _MM_SHUFFLE(1, 0, 3, 2));
2667
0
        }
2668
0
        else
2669
0
        {
2670
0
          vmask                      = _mm256_loadu_si256( ( __m256i * ) ( &weightMask[x] ) );
2671
0
        }
2672
2673
0
        vsum32 = _mm256_add_epi32(vsum32, _mm256_madd_epi16(vmask, _mm256_abs_epi16(_mm256_sub_epi16(vsrc1, vsrc2))));
2674
0
      }
2675
0
      src1       += strideSrc1;
2676
0
      src2       += strideSrc2;
2677
0
      weightMask += strideMask;
2678
0
    }
2679
    vsum32 = _mm256_hadd_epi32(vsum32, vzero);
2680
    vsum32 = _mm256_hadd_epi32(vsum32, vzero);
2681
    sum    = _mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32))
2682
           + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11)));
2683
#endif
2684
0
  }
2685
0
  else
2686
0
  {
2687
    // Process in steps of 8 samples
2688
0
    __m128i vzero  = _mm_setzero_si128();
2689
0
    __m128i vsum32 = vzero;
2690
0
    for (int y = 0; y < rows; y += subStep)
2691
0
    {
2692
0
      for (int x = 0; x < cols; x += 8)
2693
0
      {
2694
0
        __m128i vsrc1 = _mm_loadu_si128((const __m128i *) (&src1[x]));
2695
0
        __m128i vsrc2 = _mm_loadu_si128((const __m128i *) (&src2[x]));
2696
0
        __m128i vmask;
2697
0
        if (rcDtParam.stepX == -1)
2698
0
        {
2699
0
          vmask                      = _mm_loadu_si128((__m128i *) ((&weightMask[x]) - (x << 1) - (8 - 1)));
2700
0
          const __m128i shuffle_mask = _mm_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
2701
0
          vmask                      = _mm_shuffle_epi8(vmask, shuffle_mask);
2702
0
        }
2703
0
        else
2704
0
        {
2705
0
          vmask = _mm_loadu_si128((const __m128i *) (&weightMask[x]));
2706
0
        }
2707
0
        vsum32 = _mm_add_epi32(vsum32, _mm_madd_epi16(vmask, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2))));
2708
0
      }
2709
0
      src1 += strideSrc1;
2710
0
      src2 += strideSrc2;
2711
0
      weightMask += strideMask;
2712
0
    }
2713
0
    vsum32 = _mm_hadd_epi32(vsum32, vzero);
2714
0
    vsum32 = _mm_hadd_epi32(vsum32, vzero);
2715
0
    sum    = _mm_cvtsi128_si32(vsum32);
2716
0
  }
2717
0
  sum <<= subShift;
2718
0
  return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
2719
0
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSADwMask_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSADwMask_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
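xGetSADwMask_SIMD above weights every absolute difference by a 16-bit mask sample; for rcDtParam.stepX == -1 the mask is traversed right-to-left, which the vector code implements by loading the mirrored span and reversing the 16-bit lanes with a byte shuffle. A scalar restatement using the local names of the function:

Distortion sum = 0;
for( int y = 0; y < rows; y += subStep )
{
  for( int x = 0; x < cols; x++ )
  {
    const int w = ( rcDtParam.stepX == -1 ) ? weightMask[-x] : weightMask[x];
    sum += w * abs( src1[x] - src2[x] );
  }
  src1 += strideSrc1;  src2 += strideSrc2;  weightMask += strideMask;
}
sum <<= subShift;
return sum >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth );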
2720
2721
template<X86_VEXT vext, bool fastHad>
2722
Distortion RdCost::xGetHADs_SIMD( const DistParam &rcDtParam )
2723
0
{
2724
0
  const Pel*  piOrg = rcDtParam.org.buf;
2725
0
  const Pel*  piCur = rcDtParam.cur.buf;
2726
0
  const int iRows = rcDtParam.org.height;
2727
0
  const int iCols = rcDtParam.org.width;
2728
0
  const int iStrideCur = rcDtParam.cur.stride;
2729
0
  const int iStrideOrg = rcDtParam.org.stride;
2730
0
  const int iBitDepth  = rcDtParam.bitDepth;
2731
2732
0
  int  x, y;
2733
0
  Distortion uiSum = 0;
2734
2735
0
  if( iCols > iRows && ( iCols & 15 ) == 0 && ( iRows & 7 ) == 0 )
2736
0
  {
2737
0
    for( y = 0; y < iRows; y += 8 )
2738
0
    {
2739
0
      for( x = 0; x < iCols; x += 16 )
2740
0
      {
2741
0
        if( vext >= AVX2 )
2742
0
          uiSum += xCalcHAD16x8_AVX2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth );
2743
0
        else
2744
0
          uiSum += xCalcHAD16x8_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth );
2745
0
      }
2746
0
      piOrg += 8*iStrideOrg;
2747
0
      piCur += 8*iStrideCur;
2748
0
    }
2749
0
  }
2750
0
  else if( iCols < iRows && ( iRows & 15 ) == 0 && ( iCols & 7 ) == 0 )
2751
0
  {
2752
0
    for( y = 0; y < iRows; y += 16 )
2753
0
    {
2754
0
      for( x = 0; x < iCols; x += 8 )
2755
0
      {
2756
0
        if( vext >= AVX2 )
2757
0
          uiSum += xCalcHAD8x16_AVX2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth );
2758
0
        else
2759
0
          uiSum += xCalcHAD8x16_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth );
2760
0
      }
2761
0
      piOrg += 16*iStrideOrg;
2762
0
      piCur += 16*iStrideCur;
2763
0
    }
2764
0
  }
2765
0
  else if( iCols > iRows && ( iCols & 7 ) == 0 && ( iRows & 3 ) == 0 )
2766
0
  {
2767
0
    for( y = 0; y < iRows; y += 4 )
2768
0
    {
2769
0
      for( x = 0; x < iCols; x += 8 )
2770
0
      {
2771
0
        uiSum += xCalcHAD8x4_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth );
2772
0
      }
2773
0
      piOrg += 4*iStrideOrg;
2774
0
      piCur += 4*iStrideCur;
2775
0
    }
2776
0
  }
2777
0
  else if( iCols < iRows && ( iRows & 7 ) == 0 && ( iCols & 3 ) == 0 )
2778
0
  {
2779
0
    for( y = 0; y < iRows; y += 8 )
2780
0
    {
2781
0
      for( x = 0; x < iCols; x += 4 )
2782
0
      {
2783
0
        uiSum += xCalcHAD4x8_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth );
2784
0
      }
2785
0
      piOrg += 8*iStrideOrg;
2786
0
      piCur += 8*iStrideCur;
2787
0
    }
2788
0
  }
2789
0
  else if( fastHad && vext >= AVX2 && ( ( ( iRows | iCols ) & 31 ) == 0 ) && ( iRows == iCols ) )
2790
0
  {
2791
0
    for( y = 0; y < iRows; y += 32 )
2792
0
    {
2793
0
      for( x = 0; x < iCols; x += 32 )
2794
0
      {
2795
0
        uiSum += xCalcHAD32x32_fast_AVX2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth );
2796
0
      }
2797
0
      piOrg += 32 * iStrideOrg;
2798
0
      piCur += 32 * iStrideCur;
2799
0
    }
2800
0
  }
2801
0
  else if( fastHad && ( ( ( iRows | iCols ) & 31 ) == 0 ) && ( iRows == iCols ) )
2802
0
  {
2803
0
    for( y = 0; y < iRows; y += 16 )
2804
0
    {
2805
0
      for( x = 0; x < iCols; x += 16 )
2806
0
      {
2807
0
        uiSum += xCalcHAD16x16_fast_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth );
2808
0
      }
2809
0
      piOrg += 16 * iStrideOrg;
2810
0
      piCur += 16 * iStrideCur;
2811
0
    }
2812
0
  }
2813
0
  else if( vext >= AVX2 && ( ( ( iRows | iCols ) & 15 ) == 0 ) && ( iRows == iCols ) )
2814
0
  {
2815
0
    for( y = 0; y < iRows; y += 16 )
2816
0
    {
2817
0
      for( x = 0; x < iCols; x += 16 )
2818
0
      {
2819
0
        uiSum += xCalcHAD16x16_AVX2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth );
2820
0
      }
2821
0
      piOrg += 16*iStrideOrg;
2822
0
      piCur += 16*iStrideCur;
2823
0
    }
2824
0
  }
2825
0
  else if( ( ( ( iRows | iCols ) & 7 ) == 0 ) && ( iRows == iCols ) )
2826
0
  {
2827
0
    for( y = 0; y<iRows; y += 8 )
2828
0
    {
2829
0
      for( x = 0; x < iCols; x += 8 )
2830
0
      {
2831
0
        uiSum += xCalcHAD8x8_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth );
2832
0
      }
2833
0
      piOrg += 8*iStrideOrg;
2834
0
      piCur += 8*iStrideCur;
2835
0
    }
2836
0
  }
2837
0
  else if( ( iRows % 4 == 0 ) && ( iCols % 4 == 0 ) )
2838
0
  {
2839
0
    for( y = 0; y < iRows; y += 4 )
2840
0
    {
2841
0
      for( x = 0; x < iCols; x += 4 )
2842
0
      {
2843
0
        uiSum += xCalcHAD4x4_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
2844
0
      }
2845
0
      piOrg += 4*iStrideOrg;
2846
0
      piCur += 4*iStrideCur;
2847
0
    }
2848
0
  }
2849
0
  else if( ( iRows % 2 == 0 ) && ( iCols % 2 == 0 ) )
2850
0
  {
2851
0
    for( y = 0; y < iRows; y += 2 )
2852
0
    {
2853
0
      for( x = 0; x < iCols; x += 2 )
2854
0
      {
2855
0
        uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
2856
0
      }
2857
0
      piOrg += 2*iStrideOrg;
2858
0
      piCur += 2*iStrideCur;
2859
0
    }
2860
0
  }
2861
0
  else
2862
0
  {
2863
0
    THROW( "Unsupported size" );
2864
0
  }
2865
2866
0
  return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
2867
0
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs_SIMD<(vvenc::x86_simd::X86_VEXT)1, false>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs_SIMD<(vvenc::x86_simd::X86_VEXT)1, true>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs_SIMD<(vvenc::x86_simd::X86_VEXT)4, false>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs_SIMD<(vvenc::x86_simd::X86_VEXT)4, true>(vvenc::DistParam const&)
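For reference, the branch order of xGetHADs_SIMD above picks the Hadamard kernel purely from the block geometry; the following summary is derived from the conditions in the function and adds no new logic:

//  w > h,  w % 16 == 0, h % 8 == 0          -> 16x8 kernel (AVX2 or SSE)
//  h > w,  h % 16 == 0, w % 8 == 0          -> 8x16 kernel (AVX2 or SSE)
//  w > h,  w % 8  == 0, h % 4 == 0          -> 8x4 kernel
//  h > w,  h % 8  == 0, w % 4 == 0          -> 4x8 kernel
//  fastHad, square, w and h multiples of 32 -> 32x32 fast (AVX2) or 16x16 fast (SSE)
//  square, w and h multiples of 16, AVX2    -> 16x16 kernel
//  square, w and h multiples of 8           -> 8x8 kernel
//  w and h multiples of 4                   -> 4x4 kernel
//  w and h multiples of 2                   -> 2x2 kernel
//  anything else                            -> THROW( "Unsupported size" )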
2868
2869
inline Distortion getWeightedMSE_SIMD(const Pel org, const Pel cur, const int64_t fixedPTweight, unsigned uiShift)
2870
0
{
2871
0
  const Intermediate_Int iTemp = org - cur;
2872
0
  return Intermediate_Int((fixedPTweight*(iTemp*iTemp) + (1 << 15)) >> uiShift);
2873
0
}
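getWeightedMSE_SIMD above multiplies the squared error by a Q16 fixed-point weight and rounds to nearest via the (1 << 15) offset. A worked example with illustrative numbers (the shift value assumes the bit-depth adjustment term is zero):

//  org = 520, cur = 512       ->  iTemp = 8, iTemp * iTemp = 64
//  fixedPTweight = 81920      ->  1.25 in Q16
//  uiShift = 16
//  result = (81920 * 64 + (1 << 15)) >> 16 = 80   ( == 1.25 * 64 )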
2874
2875
template<X86_VEXT vext, int csx>
2876
static Distortion lumaWeightedSSE_SIMD( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights )
2877
0
{
2878
0
        int  iRows = rcDtParam.org.height;
2879
0
  const Pel* piOrg = rcDtParam.org.buf;
2880
0
  const Pel* piCur = rcDtParam.cur.buf;
2881
0
  const int  iCols = rcDtParam.org.width;
2882
0
  const int  iStrideCur = rcDtParam.cur.stride;
2883
0
  const int  iStrideOrg = rcDtParam.org.stride;
2884
0
  const Pel* piOrgLuma        = rcDtParam.orgLuma->buf;
2885
0
  const int  iStrideOrgLuma   = rcDtParam.orgLuma->stride;
2886
2887
0
  Distortion uiSum   = 0;
2888
0
  const uint32_t uiShift   = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1);
2889
0
  const __m128i vShift = _mm_cvtsi32_si128(uiShift);
2890
2891
0
  const ComponentID compId = rcDtParam.compID;
2892
0
  const size_t  cShiftY    = getComponentScaleY(compId, chmFmt);
2893
  
2894
0
  if( ( iCols & 7 ) == 0 )
2895
0
  {
2896
0
    const __m128i xoffs = _mm_set1_epi64x( 1 << 15 );
2897
0
          __m128i xsum  = _mm_setzero_si128();
2898
2899
0
    for( ; iRows != 0; iRows-- )
2900
0
    {
2901
0
      for (int n = 0; n < iCols; n += 8 )
2902
0
      {
2903
0
        const int o = n<<csx;
2904
        
2905
0
        __m128i xorg = _mm_loadu_si128( ( const __m128i* ) &piOrg[n] );
2906
0
        __m128i xcur = _mm_loadu_si128( ( const __m128i* ) &piCur[n] );
2907
        
2908
0
        xcur = _mm_sub_epi16     ( xorg, xcur );
2909
2910
0
        const __m128i
2911
0
        xmlo = _mm_mullo_epi16   ( xcur, xcur ),
2912
0
        xmhi = _mm_mulhi_epi16   ( xcur, xcur );
2913
2914
0
        __m128i
2915
0
        xwgt = _mm_setr_epi32    ( lumaWeights[piOrgLuma[o+(0<<csx)]],
2916
0
                                   lumaWeights[piOrgLuma[o+(1<<csx)]],
2917
0
                                   lumaWeights[piOrgLuma[o+(2<<csx)]],
2918
0
                                   lumaWeights[piOrgLuma[o+(3<<csx)]] );
2919
        
2920
0
        __m128i
2921
0
        xmul = _mm_unpacklo_epi16( xmlo, xmhi );
2922
0
        __m128i
2923
0
        xtmp = _mm_mul_epi32     ( xmul, xwgt );
2924
0
        xtmp = _mm_add_epi64     ( xtmp, xoffs );
2925
0
        xtmp = _mm_srl_epi64     ( xtmp, vShift );
2926
0
        xsum = _mm_add_epi64     ( xsum, xtmp );
2927
2928
0
        xwgt = _mm_shuffle_epi32 ( xwgt, 1 + 0 + 48 + 128 );
2929
0
        xmul = _mm_shuffle_epi32 ( xmul, 1 + 0 + 48 + 128 );
2930
0
        xtmp = _mm_mul_epi32     ( xmul, xwgt );
2931
0
        xtmp = _mm_add_epi64     ( xtmp, xoffs );
2932
0
        xtmp = _mm_srl_epi64     ( xtmp, vShift );
2933
0
        xsum = _mm_add_epi64     ( xsum, xtmp );
2934
        
2935
0
        xwgt = _mm_setr_epi32    ( lumaWeights[piOrgLuma[o+(4<<csx)]],
2936
0
                                   lumaWeights[piOrgLuma[o+(5<<csx)]],
2937
0
                                   lumaWeights[piOrgLuma[o+(6<<csx)]],
2938
0
                                   lumaWeights[piOrgLuma[o+(7<<csx)]] );
2939
2940
0
        xmul = _mm_unpackhi_epi16( xmlo, xmhi );
2941
0
        xtmp = _mm_mul_epi32     ( xmul, xwgt );
2942
0
        xtmp = _mm_add_epi64     ( xtmp, xoffs );
2943
0
        xtmp = _mm_srl_epi64     ( xtmp, vShift );
2944
0
        xsum = _mm_add_epi64     ( xsum, xtmp );
2945
2946
0
        xwgt = _mm_shuffle_epi32 ( xwgt, 1 + 0 + 48 + 128 );
2947
0
        xmul = _mm_shuffle_epi32 ( xmul, 1 + 0 + 48 + 128 );
2948
0
        xtmp = _mm_mul_epi32     ( xmul, xwgt );
2949
0
        xtmp = _mm_add_epi64     ( xtmp, xoffs );
2950
0
        xtmp = _mm_srl_epi64     ( xtmp, vShift );
2951
0
        xsum = _mm_add_epi64     ( xsum, xtmp );
2952
2953
        //uiSum += getWeightedMSE_SIMD( piOrg[n  ], piCur[n  ], lumaWeights[piOrgLuma[(n  )<<csx]], uiShift );
2954
        //uiSum += getWeightedMSE_SIMD( piOrg[n+1], piCur[n+1], lumaWeights[piOrgLuma[(n+1)<<csx]], uiShift );
2955
0
      }
2956
2957
0
      piOrg     += iStrideOrg;
2958
0
      piCur     += iStrideCur;
2959
0
      piOrgLuma += iStrideOrgLuma<<cShiftY;
2960
0
    }
2961
2962
0
    uiSum += _mm_extract_epi64( xsum, 0 );
2963
0
    uiSum += _mm_extract_epi64( xsum, 1 );
2964
2965
0
    return uiSum;
2966
0
  }
2967
0
  else
2968
0
  if( ( iCols & 3 ) == 0 )
2969
0
  {
2970
0
    const __m128i xoffs = _mm_set1_epi64x( 1 << 15 );
2971
0
          __m128i xsum  = _mm_setzero_si128();
2972
2973
0
    for( ; iRows != 0; iRows-- )
2974
0
    {
2975
0
      for (int n = 0; n < iCols; n += 4 )
2976
0
      {
2977
0
        const int o = n<<csx;
2978
        
2979
0
        __m128i xorg = _vv_loadl_epi64( ( const __m128i* ) &piOrg[n] );
2980
0
        __m128i xcur = _vv_loadl_epi64( ( const __m128i* ) &piCur[n] );
2981
        
2982
0
        xcur = _mm_sub_epi16     ( xorg, xcur );
2983
2984
0
        const __m128i
2985
0
        xmlo = _mm_mullo_epi16   ( xcur, xcur ),
2986
0
        xmhi = _mm_mulhi_epi16   ( xcur, xcur );
2987
2988
0
        __m128i
2989
0
        xwgt = _mm_setr_epi32    ( lumaWeights[piOrgLuma[o+(0<<csx)]],
2990
0
                                   lumaWeights[piOrgLuma[o+(1<<csx)]],
2991
0
                                   lumaWeights[piOrgLuma[o+(2<<csx)]],
2992
0
                                   lumaWeights[piOrgLuma[o+(3<<csx)]] );
2993
        
2994
0
        __m128i
2995
0
        xmul = _mm_unpacklo_epi16( xmlo, xmhi );
2996
0
        __m128i
2997
0
        xtmp = _mm_mul_epi32     ( xmul, xwgt );
2998
0
        xtmp = _mm_add_epi64     ( xtmp, xoffs );
2999
0
        xtmp = _mm_srl_epi64     ( xtmp, vShift );
3000
0
        xsum = _mm_add_epi64     ( xsum, xtmp );
3001
3002
0
        xwgt = _mm_shuffle_epi32 ( xwgt, 1 + 0 + 48 + 128 );
3003
0
        xmul = _mm_shuffle_epi32 ( xmul, 1 + 0 + 48 + 128 );
3004
0
        xtmp = _mm_mul_epi32     ( xmul, xwgt );
3005
0
        xtmp = _mm_add_epi64     ( xtmp, xoffs );
3006
0
        xtmp = _mm_srl_epi64     ( xtmp, vShift );
3007
0
        xsum = _mm_add_epi64     ( xsum, xtmp );
3008
3009
        //uiSum += getWeightedMSE_SIMD( piOrg[n  ], piCur[n  ], lumaWeights[piOrgLuma[(n  )<<csx]], uiShift );
3010
        //uiSum += getWeightedMSE_SIMD( piOrg[n+1], piCur[n+1], lumaWeights[piOrgLuma[(n+1)<<csx]], uiShift );
3011
0
      }
3012
3013
0
      piOrg     += iStrideOrg;
3014
0
      piCur     += iStrideCur;
3015
0
      piOrgLuma += iStrideOrgLuma<<cShiftY;
3016
0
    }
3017
3018
0
    uiSum += _mm_extract_epi64( xsum, 0 );
3019
0
    uiSum += _mm_extract_epi64( xsum, 1 );
3020
3021
0
    return uiSum;
3022
0
  }
3023
0
  else
3024
0
  if( ( iCols & 1 ) == 0 )
3025
0
  {
3026
0
    for( ; iRows != 0; iRows-- )
3027
0
    {
3028
0
      for (int n = 0; n < iCols; n+=2 )
3029
0
      {
3030
0
        uiSum += getWeightedMSE_SIMD( piOrg[n  ], piCur[n  ], lumaWeights[piOrgLuma[(n  )<<csx]], uiShift );
3031
0
        uiSum += getWeightedMSE_SIMD( piOrg[n+1], piCur[n+1], lumaWeights[piOrgLuma[(n+1)<<csx]], uiShift );
3032
0
      }
3033
3034
0
      piOrg     += iStrideOrg;
3035
0
      piCur     += iStrideCur;
3036
0
      piOrgLuma += iStrideOrgLuma<<cShiftY;
3037
0
    }
3038
3039
0
    return uiSum;
3040
0
  }
3041
0
  else
3042
0
  {
3043
0
    for( ; iRows != 0; iRows-- )
3044
0
    {
3045
0
      for (int n = 0; n < iCols; n++ )
3046
0
      {
3047
0
        uiSum += getWeightedMSE_SIMD( piOrg[n   ], piCur[n   ], lumaWeights[piOrgLuma[(n   )<<csx]], uiShift );
3048
0
      }
3049
3050
0
      piOrg     += iStrideOrg;
3051
0
      piCur     += iStrideCur;
3052
0
      piOrgLuma += iStrideOrgLuma<<cShiftY;
3053
0
    }
3054
3055
0
    return uiSum;
3056
0
  }
3057
3058
0
  return 0;
3059
0
}
Unexecuted instantiation: RdCost_sse41.cpp:unsigned long vvenc::lumaWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)1, 0>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
Unexecuted instantiation: RdCost_sse41.cpp:unsigned long vvenc::lumaWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)1, 1>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
Unexecuted instantiation: RdCost_avx2.cpp:unsigned long vvenc::lumaWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)4, 0>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
Unexecuted instantiation: RdCost_avx2.cpp:unsigned long vvenc::lumaWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)4, 1>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
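lumaWeightedSSE_SIMD above fetches the per-sample weight from a LUT indexed by the co-located original luma sample; the template parameter csx is the horizontal chroma subsampling shift and cShiftY the vertical one. The scalar tail paths of the function already spell out the computation; condensed:

for( ; iRows != 0; iRows-- )
{
  for( int n = 0; n < iCols; n++ )
    uiSum += getWeightedMSE_SIMD( piOrg[n], piCur[n],
                                  lumaWeights[ piOrgLuma[n << csx] ], uiShift );
  piOrg     += iStrideOrg;
  piCur     += iStrideCur;
  piOrgLuma += iStrideOrgLuma << cShiftY;
}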
3060
3061
template<X86_VEXT vext>
3062
static Distortion fixWeightedSSE_SIMD( const DistParam& rcDtParam, uint32_t fixedPTweight )
3063
0
{
3064
0
        int  iRows = rcDtParam.org.height;
3065
0
  const Pel* piOrg = rcDtParam.org.buf;
3066
0
  const Pel* piCur = rcDtParam.cur.buf;
3067
0
  const int  iCols = rcDtParam.org.width;
3068
0
  const int  iStrideCur = rcDtParam.cur.stride;
3069
0
  const int  iStrideOrg = rcDtParam.org.stride;
3070
3071
0
  Distortion uiSum       = 0;
3072
0
  const uint32_t uiShift = 16 + ( DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) << 1 );
3073
0
  const __m128i vShift   = _mm_cvtsi32_si128(uiShift);
3074
3075
0
  if( ( iCols & 3 ) == 0 )
3076
0
  {
3077
0
    const __m128i xfxdw = _mm_set1_epi32 ( fixedPTweight );
3078
0
    const __m128i xoffs = _mm_set1_epi64x( 1 << 15 );
3079
0
          __m128i xsum  = _mm_setzero_si128();
3080
3081
0
    for( ; iRows != 0; iRows-- )
3082
0
    {
3083
0
      for( int n = 0; n < iCols; n += 4 )
3084
0
      {
3085
0
        __m128i xorg = _vv_loadl_epi64( ( const __m128i* ) &piOrg[n] );
3086
0
        __m128i xcur = _vv_loadl_epi64( ( const __m128i* ) &piCur[n] );
3087
3088
0
        xcur = _mm_sub_epi16     ( xorg, xcur );
3089
3090
0
        const __m128i
3091
0
        xmlo = _mm_mullo_epi16   ( xcur, xcur ),
3092
0
        xmhi = _mm_mulhi_epi16   ( xcur, xcur );
3093
3094
0
        __m128i
3095
0
        xmul = _mm_unpacklo_epi16( xmlo, xmhi );
3096
0
        __m128i
3097
0
        xtmp = _mm_mul_epi32     ( xmul, xfxdw );
3098
0
        xtmp = _mm_add_epi64     ( xtmp, xoffs );
3099
0
        xtmp = _mm_srl_epi64     ( xtmp, vShift );
3100
0
        xsum = _mm_add_epi64     ( xsum, xtmp );
3101
3102
0
        xmul = _mm_shuffle_epi32 ( xmul, 1 + 0 + 48 + 128 );
3103
0
        xtmp = _mm_mul_epi32     ( xmul, xfxdw );
3104
0
        xtmp = _mm_add_epi64     ( xtmp, xoffs );
3105
0
        xtmp = _mm_srl_epi64     ( xtmp, vShift );
3106
0
        xsum = _mm_add_epi64     ( xsum, xtmp );
3107
0
      }
3108
0
      piOrg += iStrideOrg;
3109
0
      piCur += iStrideCur;
3110
0
    }
3111
3112
0
    uiSum += _mm_extract_epi64( xsum, 0 );
3113
0
    uiSum += _mm_extract_epi64( xsum, 1 );
3114
3115
0
    return uiSum;
3116
0
  }
3117
0
  else if( iCols == 2 )
3118
0
  {
3119
0
    for( ; iRows != 0; iRows-- )
3120
0
    {
3121
0
      for( int n = 0; n < iCols; n += 2 )
3122
0
      {
3123
0
        uiSum += getWeightedMSE_SIMD( piOrg[n    ], piCur[n    ], fixedPTweight, uiShift );
3124
0
        uiSum += getWeightedMSE_SIMD( piOrg[n + 1], piCur[n + 1], fixedPTweight, uiShift );
3125
0
      }
3126
0
      piOrg += iStrideOrg;
3127
0
      piCur += iStrideCur;
3128
0
    }
3129
3130
0
    return uiSum;
3131
0
  }
3132
0
  else
3133
0
  {
3134
0
    for( ; iRows != 0; iRows-- )
3135
0
    {
3136
0
      for( int n = 0; n < iCols; n++ )
3137
0
      {
3138
0
        uiSum += getWeightedMSE_SIMD( piOrg[n], piCur[n], fixedPTweight, uiShift );
3139
0
      }
3140
0
      piOrg += iStrideOrg;
3141
0
      piCur += iStrideCur;
3142
0
    }
3143
3144
0
    return uiSum;
3145
0
  }
3146
3147
0
  return 0;
3148
0
}
Unexecuted instantiation: RdCost_sse41.cpp:unsigned long vvenc::fixWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&, unsigned int)
Unexecuted instantiation: RdCost_avx2.cpp:unsigned long vvenc::fixWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&, unsigned int)
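fixWeightedSSE_SIMD above applies a single Q16 weight to the whole block, in contrast to lumaWeightedSSE_SIMD which re-reads the weight per sample. Purely as an illustration of the Q16 convention (this conversion is not part of this file), a floating-point weight w would map to the parameter as:

// hypothetical conversion of a floating-point weight into the Q16 parameter
uint32_t fixedPTweight = (uint32_t)( w * ( 1 << 16 ) + 0.5 );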
3149
3150
3151
template <X86_VEXT vext, bool isCalCentrePos>
3152
0
void xGetSADX5_8xN_SIMDImp(const DistParam& rcDtParam, Distortion* cost) {
3153
0
  int i;
3154
0
  const Pel* piOrg = rcDtParam.org.buf;
3155
0
  const Pel* piCur = rcDtParam.cur.buf - 4;
3156
0
  int height = rcDtParam.org.height;
3157
0
  int iSubShift = rcDtParam.subShift;
3158
0
  int iSubStep = (1 << iSubShift);
3159
0
  ptrdiff_t iStrideCur = rcDtParam.cur.stride * iSubStep;
3160
0
  ptrdiff_t iStrideOrg = rcDtParam.org.stride * iSubStep;
3161
3162
0
  __m128i sum0 = _mm_setzero_si128();
3163
0
  __m128i sum1 = _mm_setzero_si128();
3164
0
  __m128i sum2 = _mm_setzero_si128();
3165
0
  __m128i sum3 = _mm_setzero_si128();
3166
0
  __m128i sum4 = _mm_setzero_si128();
3167
3168
0
  __m128i vone = _mm_set1_epi16(1);
3169
0
  for (i = 0; i < height; i += iSubStep) {
3170
0
    __m128i s0 = _mm_loadu_si128((__m128i*)piOrg);
3171
0
    __m128i s1 = _mm_loadu_si128((__m128i*)piCur);
3172
0
    __m128i s2 = _vv_loadl_epi64((__m128i*)(piOrg + 8));
3173
0
    __m128i s3 = _vv_loadl_epi64((__m128i*)(piCur + 8));
3174
3175
0
    __m128i org0, org1, org2, org3, org4;
3176
0
    org0 = s0;
3177
0
    org1 = _mm_alignr_epi8(s2, s0, 2);
3178
0
    if (isCalCentrePos) org2 = _mm_alignr_epi8(s2, s0, 4);
3179
0
    org3 = _mm_alignr_epi8(s2, s0, 6);
3180
0
    org4 = _mm_alignr_epi8(s2, s0, 8);
3181
3182
0
    __m128i cur0, cur1, cur2, cur3, cur4;
3183
0
    cur4 = s1;
3184
0
    cur0 = _mm_alignr_epi8(s3, s1, 8);
3185
0
    cur1 = _mm_alignr_epi8(s3, s1, 6);
3186
0
    if (isCalCentrePos) cur2 = _mm_alignr_epi8(s3, s1, 4);
3187
0
    cur3 = _mm_alignr_epi8(s3, s1, 2);
3188
3189
0
    __m128i diff0, diff1, diff2, diff3, diff4;
3190
0
    diff0 = _mm_sub_epi16(org0, cur0);
3191
0
    diff1 = _mm_sub_epi16(org1, cur1);
3192
0
    if (isCalCentrePos) diff2 = _mm_sub_epi16(org2, cur2);
3193
0
    diff3 = _mm_sub_epi16(org3, cur3);
3194
0
    diff4 = _mm_sub_epi16(org4, cur4);
3195
3196
0
    diff0 = _mm_abs_epi16(diff0);
3197
0
    diff1 = _mm_abs_epi16(diff1);
3198
0
    if (isCalCentrePos) diff2 = _mm_abs_epi16(diff2);
3199
0
    diff3 = _mm_abs_epi16(diff3);
3200
0
    diff4 = _mm_abs_epi16(diff4);
3201
3202
0
    sum0 = _mm_add_epi16(sum0, diff0);
3203
0
    sum1 = _mm_add_epi16(sum1, diff1);
3204
0
    if (isCalCentrePos) sum2 = _mm_add_epi32(sum2, diff2);
3205
0
    sum3 = _mm_add_epi16(sum3, diff3);
3206
0
    sum4 = _mm_add_epi16(sum4, diff4);
3207
3208
0
    piOrg += iStrideOrg;
3209
0
    piCur += iStrideCur;
3210
0
  }
3211
3212
0
  sum0 = _mm_madd_epi16( sum0, vone );
3213
0
  sum1 = _mm_madd_epi16( sum1, vone );
3214
0
  if( isCalCentrePos ) sum2 = _mm_madd_epi16( sum2, vone );
3215
0
  sum3 = _mm_madd_epi16( sum3, vone );
3216
0
  sum4 = _mm_madd_epi16( sum4, vone );
3217
3218
0
  sum0 = _mm_hadd_epi32(sum0, sum1);
3219
0
  sum3 = _mm_hadd_epi32(sum3, sum4);
3220
0
  if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2);
3221
3222
0
  sum0 = _mm_hadd_epi32(sum0, sum3);
3223
0
  if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2);
3224
3225
0
  const __m128i vSubShift = _mm_cvtsi32_si128(iSubShift);
3226
0
  sum0 = _mm_sll_epi32(sum0, vSubShift);
3227
0
  if (isCalCentrePos) sum2 = _mm_sll_epi32(sum2, vSubShift);
3228
3229
0
  sum0 = _mm_srl_epi32(sum0, _mm_cvtsi32_si128(1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth))));
3230
0
  if (isCalCentrePos) sum2 = _mm_srl_epi32(sum2, _mm_cvtsi32_si128(1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth))));
3231
3232
0
  _mm_storeu_si128( ( __m128i* ) &cost[0], _mm_unpacklo_epi32( sum0, _mm_setzero_si128() ) );
3233
0
  if (isCalCentrePos) cost[2] = (_mm_cvtsi128_si32(sum2));
3234
0
  _mm_storeu_si128( ( __m128i* ) &cost[3], _mm_unpackhi_epi32( sum0, _mm_setzero_si128() ) );
3235
0
}
Unexecuted instantiation: void vvenc::xGetSADX5_8xN_SIMDImp<(vvenc::x86_simd::X86_VEXT)1, true>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_8xN_SIMDImp<(vvenc::x86_simd::X86_VEXT)1, false>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_8xN_SIMDImp<(vvenc::x86_simd::X86_VEXT)4, true>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_8xN_SIMDImp<(vvenc::x86_simd::X86_VEXT)4, false>(vvenc::DistParam const&, unsigned long*)
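xGetSADX5_8xN_SIMDImp above evaluates five SADs at horizontal displacements spaced two samples apart; cost[2], the centre position, is only computed when isCalCentrePos is set. A scalar restatement of what the accumulators hold (piCur already points four samples left of the centre, as set at the top of the function; orgStride/curStride stand for the picture strides):

for( int k = 0; k < 5; k++ )   // k == 2 only when isCalCentrePos
{
  Distortion acc = 0;
  for( int y = 0; y < height; y += iSubStep )
    for( int j = 0; j < 8; j++ )
      acc += abs( piOrg[y * orgStride + j + k] - piCur[y * curStride + j + 4 - k] );
  cost[k] = ( acc << iSubShift ) >> ( 1 + DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) );
}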
3236
3237
template <X86_VEXT vext>
3238
0
void RdCost::xGetSADX5_8xN_SIMD(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
3239
0
  if( rcDtParam.bitDepth > 10 ){
3240
0
    RdCost::xGetSAD8X5( rcDtParam, cost, isCalCentrePos );
3241
0
    return;
3242
0
  }
3243
  
3244
0
  if (isCalCentrePos)
3245
0
    xGetSADX5_8xN_SIMDImp<vext, true>(rcDtParam, cost);
3246
0
  else
3247
0
    xGetSADX5_8xN_SIMDImp<vext, false>(rcDtParam, cost);
3248
0
}
Unexecuted instantiation: void vvenc::RdCost::xGetSADX5_8xN_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&, unsigned long*, bool)
Unexecuted instantiation: void vvenc::RdCost::xGetSADX5_8xN_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&, unsigned long*, bool)
3249
3250
template <X86_VEXT vext, bool isCalCentrePos>
3251
0
void xGetSADX5_16xN_SIMDImp_X86(const DistParam& rcDtParam, Distortion* cost) {
3252
0
  int i, j;
3253
0
  const Pel* piOrg = rcDtParam.org.buf;
3254
0
  const Pel* piCur = rcDtParam.cur.buf - 4;
3255
0
  int height = rcDtParam.org.height;
3256
0
  int iSubShift = rcDtParam.subShift;
3257
0
  int iSubStep = (1 << iSubShift);
3258
0
  ptrdiff_t iStrideCur = rcDtParam.cur.stride * iSubStep;
3259
0
  ptrdiff_t iStrideOrg = rcDtParam.org.stride * iSubStep;
3260
3261
#  ifdef USE_AVX2
3262
0
  if (vext >= AVX2) {
3263
    // the sum of 8 unsigned 10-bit values (0-1023) needs at most 3 + 10 bits, i.e. it fits into 16 bits
3264
3265
    __m256i sum0 = _mm256_setzero_si256();
3266
    __m256i sum1 = _mm256_setzero_si256();
3267
    __m256i sum2 = _mm256_setzero_si256();
3268
    __m256i sum3 = _mm256_setzero_si256();
3269
    __m256i sum4 = _mm256_setzero_si256();
3270
3271
    __m256i vone = _mm256_set1_epi16(1);
3272
3273
0
    for (int i = 0; i < ( height >> 3 ); i++) {
3274
0
      __m256i s0 = _mm256_loadu_si256((__m256i*)piOrg);
3275
0
      __m256i s1 = _mm256_loadu_si256((__m256i*)piCur);
3276
0
      __m256i s2 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piOrg + 16)));
3277
0
      __m256i s3 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piCur + 16)));
3278
0
      s2 = _mm256_permute2x128_si256(s0, s2, 0x21);
3279
0
      s3 = _mm256_permute2x128_si256(s1, s3, 0x21);
3280
3281
0
      piOrg += iStrideOrg;
3282
0
      piCur += iStrideCur;
3283
3284
0
      __m256i org0, org1, org2, org3, org4;
3285
0
      org0 = s0;
3286
0
      org1 = _mm256_alignr_epi8(s2, s0, 2);
3287
0
      if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4);
3288
0
      org3 = _mm256_alignr_epi8(s2, s0, 6);
3289
0
      org4 = _mm256_alignr_epi8(s2, s0, 8);
3290
3291
0
      __m256i cur0, cur1, cur2, cur3, cur4;
3292
0
      cur4 = s1;
3293
0
      cur0 = _mm256_alignr_epi8(s3, s1, 8);
3294
0
      cur1 = _mm256_alignr_epi8(s3, s1, 6);
3295
0
      if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4);
3296
0
      cur3 = _mm256_alignr_epi8(s3, s1, 2);
3297
3298
0
      __m256i diff0, diff1, diff2, diff3, diff4;
3299
0
      diff0 = _mm256_sub_epi16(org0, cur0);
3300
0
      diff1 = _mm256_sub_epi16(org1, cur1);
3301
0
      if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2);
3302
0
      diff3 = _mm256_sub_epi16(org3, cur3);
3303
0
      diff4 = _mm256_sub_epi16(org4, cur4);
3304
3305
0
      diff0 = _mm256_abs_epi16( diff0 );
3306
0
      diff1 = _mm256_abs_epi16( diff1 );
3307
0
      if( isCalCentrePos ) diff2 = _mm256_abs_epi16( diff2 );
3308
0
      diff3 = _mm256_abs_epi16( diff3 );
3309
0
      diff4 = _mm256_abs_epi16( diff4 );
3310
3311
0
      sum0 = _mm256_add_epi16( diff0, sum0 );
3312
0
      sum1 = _mm256_add_epi16( diff1, sum1 );
3313
0
      if( isCalCentrePos ) sum2 = _mm256_add_epi16( diff2, sum2 );
3314
0
      sum3 = _mm256_add_epi16( diff3, sum3 );
3315
0
      sum4 = _mm256_add_epi16( diff4, sum4 );
3316
3317
0
      s0 = _mm256_loadu_si256((__m256i*)piOrg);
3318
0
      s1 = _mm256_loadu_si256((__m256i*)piCur);
3319
0
      s2 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piOrg + 16)));
3320
0
      s3 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piCur + 16)));
3321
0
      s2 = _mm256_permute2x128_si256(s0, s2, 0x21);
3322
0
      s3 = _mm256_permute2x128_si256(s1, s3, 0x21);
3323
3324
0
      piOrg += iStrideOrg;
3325
0
      piCur += iStrideCur;
3326
3327
0
      org0 = s0;
3328
0
      org1 = _mm256_alignr_epi8(s2, s0, 2);
3329
0
      if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4);
3330
0
      org3 = _mm256_alignr_epi8(s2, s0, 6);
3331
0
      org4 = _mm256_alignr_epi8(s2, s0, 8);
3332
3333
0
      cur4 = s1;
3334
0
      cur0 = _mm256_alignr_epi8(s3, s1, 8);
3335
0
      cur1 = _mm256_alignr_epi8(s3, s1, 6);
3336
0
      if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4);
3337
0
      cur3 = _mm256_alignr_epi8(s3, s1, 2);
3338
3339
0
      diff0 = _mm256_sub_epi16(org0, cur0);
3340
0
      diff1 = _mm256_sub_epi16(org1, cur1);
3341
0
      if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2);
3342
0
      diff3 = _mm256_sub_epi16(org3, cur3);
3343
0
      diff4 = _mm256_sub_epi16(org4, cur4);
3344
3345
0
      diff0 = _mm256_abs_epi16(diff0);
3346
0
      diff1 = _mm256_abs_epi16(diff1);
3347
0
      if (isCalCentrePos) diff2 = _mm256_abs_epi16(diff2);
3348
0
      diff3 = _mm256_abs_epi16(diff3);
3349
0
      diff4 = _mm256_abs_epi16(diff4);
3350
3351
0
      sum0 = _mm256_add_epi16(diff0, sum0);
3352
0
      sum1 = _mm256_add_epi16(diff1, sum1);
3353
0
      if (isCalCentrePos) sum2 = _mm256_add_epi16(diff2, sum2);
3354
0
      sum3 = _mm256_add_epi16(diff3, sum3);
3355
0
      sum4 = _mm256_add_epi16(diff4, sum4);
3356
3357
0
      s0 = _mm256_loadu_si256((__m256i*)piOrg);
3358
0
      s1 = _mm256_loadu_si256((__m256i*)piCur);
3359
0
      s2 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piOrg + 16)));
3360
0
      s3 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piCur + 16)));
3361
0
      s2 = _mm256_permute2x128_si256(s0, s2, 0x21);
3362
0
      s3 = _mm256_permute2x128_si256(s1, s3, 0x21);
3363
3364
0
      piOrg += iStrideOrg;
3365
0
      piCur += iStrideCur;
3366
3367
0
      org0 = s0;
3368
0
      org1 = _mm256_alignr_epi8(s2, s0, 2);
3369
0
      if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4);
3370
0
      org3 = _mm256_alignr_epi8(s2, s0, 6);
3371
0
      org4 = _mm256_alignr_epi8(s2, s0, 8);
3372
3373
0
      cur4 = s1;
3374
0
      cur0 = _mm256_alignr_epi8(s3, s1, 8);
3375
0
      cur1 = _mm256_alignr_epi8(s3, s1, 6);
3376
0
      if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4);
3377
0
      cur3 = _mm256_alignr_epi8(s3, s1, 2);
3378
3379
0
      diff0 = _mm256_sub_epi16(org0, cur0);
3380
0
      diff1 = _mm256_sub_epi16(org1, cur1);
3381
0
      if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2);
3382
0
      diff3 = _mm256_sub_epi16(org3, cur3);
3383
0
      diff4 = _mm256_sub_epi16(org4, cur4);
3384
3385
0
      diff0 = _mm256_abs_epi16(diff0);
3386
0
      diff1 = _mm256_abs_epi16(diff1);
3387
0
      if (isCalCentrePos) diff2 = _mm256_abs_epi16(diff2);
3388
0
      diff3 = _mm256_abs_epi16(diff3);
3389
0
      diff4 = _mm256_abs_epi16(diff4);
3390
3391
0
      sum0 = _mm256_add_epi16( diff0, sum0 );
3392
0
      sum1 = _mm256_add_epi16( diff1, sum1 );
3393
0
      if( isCalCentrePos ) sum2 = _mm256_add_epi16( diff2, sum2 );
3394
0
      sum3 = _mm256_add_epi16( diff3, sum3 );
3395
0
      sum4 = _mm256_add_epi16( diff4, sum4 );
3396
3397
0
      s0 = _mm256_loadu_si256((__m256i*)piOrg);
3398
0
      s1 = _mm256_loadu_si256((__m256i*)piCur);
3399
0
      s2 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piOrg + 16)));
3400
0
      s3 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piCur + 16)));
3401
0
      s2 = _mm256_permute2x128_si256(s0, s2, 0x21);
3402
0
      s3 = _mm256_permute2x128_si256(s1, s3, 0x21);
3403
3404
0
      piOrg += iStrideOrg;
3405
0
      piCur += iStrideCur;
3406
3407
0
      org0 = s0;
3408
0
      org1 = _mm256_alignr_epi8(s2, s0, 2);
3409
0
      if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4);
3410
0
      org3 = _mm256_alignr_epi8(s2, s0, 6);
3411
0
      org4 = _mm256_alignr_epi8(s2, s0, 8);
3412
3413
0
      cur4 = s1;
3414
0
      cur0 = _mm256_alignr_epi8(s3, s1, 8);
3415
0
      cur1 = _mm256_alignr_epi8(s3, s1, 6);
3416
0
      if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4);
3417
0
      cur3 = _mm256_alignr_epi8(s3, s1, 2);
3418
3419
0
      diff0 = _mm256_sub_epi16(org0, cur0);
3420
0
      diff1 = _mm256_sub_epi16(org1, cur1);
3421
0
      if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2);
3422
0
      diff3 = _mm256_sub_epi16(org3, cur3);
3423
0
      diff4 = _mm256_sub_epi16(org4, cur4);
3424
3425
0
      diff0 = _mm256_abs_epi16(diff0);
3426
0
      diff1 = _mm256_abs_epi16(diff1);
3427
0
      if (isCalCentrePos) diff2 = _mm256_abs_epi16(diff2);
3428
0
      diff3 = _mm256_abs_epi16(diff3);
3429
0
      diff4 = _mm256_abs_epi16(diff4);
3430
3431
0
      sum0 = _mm256_add_epi16(diff0, sum0);
3432
0
      sum1 = _mm256_add_epi16(diff1, sum1);
3433
0
      if (isCalCentrePos) sum2 = _mm256_add_epi16(diff2, sum2);
3434
0
      sum3 = _mm256_add_epi16(diff3, sum3);
3435
0
      sum4 = _mm256_add_epi16(diff4, sum4);
3436
0
    }
3437
3438
    sum0 = _mm256_madd_epi16( sum0, vone );
3439
    sum1 = _mm256_madd_epi16( sum1, vone );
3440
0
    if( isCalCentrePos ) sum2 = _mm256_madd_epi16( sum2, vone );
3441
    sum3 = _mm256_madd_epi16( sum3, vone );
3442
    sum4 = _mm256_madd_epi16( sum4, vone );
3443
3444
    sum0 = _mm256_hadd_epi32(sum0, sum1);
3445
    sum3 = _mm256_hadd_epi32(sum3, sum4);
3446
0
    if (isCalCentrePos) sum2 = _mm256_hadd_epi32(sum2, sum2);
3447
3448
    sum0 = _mm256_hadd_epi32(sum0, sum3);
3449
0
    if (isCalCentrePos) sum2 = _mm256_hadd_epi32(sum2, sum2);
3450
3451
    __m128i sum0134 = _mm_add_epi32(_mm256_castsi256_si128(sum0), _mm256_extracti128_si256(sum0, 1));
3452
3453
    sum0134 = _mm_sll_epi32(sum0134, _mm_cvtsi32_si128(iSubShift));
3454
3455
0
    sum0134 = _mm_srl_epi32(sum0134, _mm_cvtsi32_si128(1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth))));
3456
3457
0
    _mm_storeu_si128( ( __m128i* ) &cost[0], _mm_unpacklo_epi32( sum0134, _mm_setzero_si128() ) );
3458
0
    if (isCalCentrePos) {
3459
0
      int tmp = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum2)) + _mm256_extract_epi32(sum2, 4);
3460
0
      tmp <<= iSubShift;
3461
0
      tmp >>= (1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth)));
3462
0
      cost[2] = tmp;
3463
0
    }
3464
0
    _mm_storeu_si128( ( __m128i* ) &cost[3], _mm_unpackhi_epi32( sum0134, _mm_setzero_si128() ) );
3465
0
  }
3466
0
  else
3467
0
#  endif
3468
0
  {
3469
    // the sum of 16 unsigned 10-bit values (0-1023) needs at most 4 + 10 bits, i.e. it fits into 16 bits
3470
3471
0
    __m128i sum0 = _mm_setzero_si128();
3472
0
    __m128i sum1 = _mm_setzero_si128();
3473
0
    __m128i sum2 = _mm_setzero_si128();
3474
0
    __m128i sum3 = _mm_setzero_si128();
3475
0
    __m128i sum4 = _mm_setzero_si128();
3476
3477
0
    __m128i vone = _mm_set1_epi16(1);
3478
0
    for (i = 0; i < height; i += iSubStep) {
3479
0
      for (j = 0; j < 16; j += 8) {
3480
0
        __m128i s0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(piOrg + j + 0));
3481
0
        __m128i s1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(piCur + j + 0));
3482
0
        __m128i s2 = _vv_loadl_epi64(reinterpret_cast<const __m128i*>(piOrg + j + 8));
3483
0
        __m128i s3 = _vv_loadl_epi64(reinterpret_cast<const __m128i*>(piCur + j + 8));
3484
3485
0
        __m128i org0, org1, org2, org3, org4;
3486
0
        org0 = s0;
3487
0
        org1 = _mm_alignr_epi8(s2, s0, 2);
3488
0
        if (isCalCentrePos) org2 = _mm_alignr_epi8(s2, s0, 4);
3489
0
        org3 = _mm_alignr_epi8(s2, s0, 6);
3490
0
        org4 = _mm_alignr_epi8(s2, s0, 8);
3491
3492
0
        __m128i cur0, cur1, cur2, cur3, cur4;
3493
0
        cur4 = s1;
3494
0
        cur0 = _mm_alignr_epi8(s3, s1, 8);
3495
0
        cur1 = _mm_alignr_epi8(s3, s1, 6);
3496
0
        if (isCalCentrePos) cur2 = _mm_alignr_epi8(s3, s1, 4);
3497
0
        cur3 = _mm_alignr_epi8(s3, s1, 2);
3498
3499
0
        __m128i diff0, diff1, diff2, diff3, diff4;
3500
0
        diff0 = _mm_sub_epi16(org0, cur0);
3501
0
        diff1 = _mm_sub_epi16(org1, cur1);
3502
0
        if (isCalCentrePos) diff2 = _mm_sub_epi16(org2, cur2);
3503
0
        diff3 = _mm_sub_epi16(org3, cur3);
3504
0
        diff4 = _mm_sub_epi16(org4, cur4);
3505
3506
0
        diff0 = _mm_abs_epi16(diff0);
3507
0
        diff1 = _mm_abs_epi16(diff1);
3508
0
        if (isCalCentrePos) diff2 = _mm_abs_epi16(diff2);
3509
0
        diff3 = _mm_abs_epi16(diff3);
3510
0
        diff4 = _mm_abs_epi16(diff4);
3511
3512
0
        sum0 = _mm_add_epi16(sum0, diff0);
3513
0
        sum1 = _mm_add_epi16(sum1, diff1);
3514
0
        if (isCalCentrePos) sum2 = _mm_add_epi16(sum2, diff2);
3515
0
        sum3 = _mm_add_epi16(sum3, diff3);
3516
0
        sum4 = _mm_add_epi16(sum4, diff4);
3517
0
      }
3518
3519
0
      piOrg += iStrideOrg;
3520
0
      piCur += iStrideCur;
3521
0
    }
3522
3523
0
    sum0 = _mm_madd_epi16( sum0, vone );
3524
0
    sum1 = _mm_madd_epi16( sum1, vone );
3525
0
    if( isCalCentrePos ) sum2 = _mm_madd_epi16( sum2, vone );
3526
0
    sum3 = _mm_madd_epi16( sum3, vone );
3527
0
    sum4 = _mm_madd_epi16( sum4, vone );
3528
3529
0
    sum0 = _mm_hadd_epi32(sum0, sum1);
3530
0
    sum3 = _mm_hadd_epi32(sum3, sum4);
3531
0
    if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2);
3532
3533
0
    sum0 = _mm_hadd_epi32(sum0, sum3);
3534
0
    if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2);
3535
3536
0
    const __m128i vSubShift = _mm_cvtsi32_si128(iSubShift);
3537
0
    sum0 = _mm_sll_epi32(sum0, vSubShift);
3538
0
    if (isCalCentrePos) sum2 = _mm_sll_epi32(sum2, vSubShift);
3539
3540
0
    sum0 = _mm_srl_epi32(sum0, _mm_cvtsi32_si128(1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth))));
3541
0
    if (isCalCentrePos) sum2 = _mm_srl_epi32(sum2, _mm_cvtsi32_si128(1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth))));
3542
3543
0
    _mm_storeu_si128( ( __m128i* ) &cost[0], _mm_unpacklo_epi32( sum0, _mm_setzero_si128() ) );
3544
0
    if (isCalCentrePos) cost[2] = (_mm_cvtsi128_si32(sum2));
3545
0
    _mm_storeu_si128( ( __m128i* ) &cost[3], _mm_unpackhi_epi32( sum0, _mm_setzero_si128() ) );
3546
0
  }
3547
0
}
Unexecuted instantiation: void vvenc::xGetSADX5_16xN_SIMDImp_X86<(vvenc::x86_simd::X86_VEXT)1, true>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_16xN_SIMDImp_X86<(vvenc::x86_simd::X86_VEXT)1, false>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_16xN_SIMDImp_X86<(vvenc::x86_simd::X86_VEXT)4, true>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_16xN_SIMDImp_X86<(vvenc::x86_simd::X86_VEXT)4, false>(vvenc::DistParam const&, unsigned long*)
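The loop listed above evaluates five SADs in a single pass: the original block is compared against the current block at five horizontal alignments produced by the _mm_alignr_epi8 register shifts, and the centre position (index 2) is only accumulated when isCalCentrePos is set. The following scalar sketch is an editorial illustration of that accumulation, not vvenc code; the function name and the height/stride parameters are assumptions, and the final row-subsampling/bit-depth rescaling performed by the SIMD code is omitted.

#include <cstdint>
#include <cstdlib>

// Editorial sketch only: scalar equivalent of the five-offset SAD accumulation above.
// cost[k] compares org advanced by k pels with cur advanced by (4 - k) pels, mirroring
// the alignr-based shifts in the SIMD loop; the centre cost (k == 2) is optional.
static void sadX5_16xN_scalar_sketch(const int16_t* org, int strideOrg,
                                     const int16_t* cur, int strideCur,
                                     int height, uint64_t cost[5], bool calcCentre)
{
  for (int k = 0; k < 5; k++) cost[k] = 0;
  for (int y = 0; y < height; y++)
  {
    for (int x = 0; x < 16; x++)
    {
      for (int k = 0; k < 5; k++)
      {
        if (k == 2 && !calcCentre) continue;
        cost[k] += std::abs(int(org[x + k]) - int(cur[x + 4 - k]));
      }
    }
    org += strideOrg;
    cur += strideCur;
  }
  // The SIMD version additionally rescales each sum for row subsampling (iSubShift)
  // and bit depth (DISTORTION_PRECISION_ADJUSTMENT); that step is left out here.
}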
3548
3549
template <X86_VEXT vext>
3550
0
void RdCost::xGetSADX5_16xN_SIMD_X86(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
3551
0
  if( rcDtParam.bitDepth > 10 ){
3552
0
    RdCost::xGetSAD16X5( rcDtParam, cost, isCalCentrePos );
3553
0
    return;
3554
0
  }
3555
  
3556
0
  if (isCalCentrePos)
3557
0
    xGetSADX5_16xN_SIMDImp_X86<vext, true>(rcDtParam, cost);
3558
0
  else
3559
0
    xGetSADX5_16xN_SIMDImp_X86<vext, false>(rcDtParam, cost);
3560
0
}
Unexecuted instantiation: void vvenc::RdCost::xGetSADX5_16xN_SIMD_X86<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&, unsigned long*, bool)
Unexecuted instantiation: void vvenc::RdCost::xGetSADX5_16xN_SIMD_X86<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&, unsigned long*, bool)
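The wrapper above hoists the isCalCentrePos flag out of the hot loop by selecting one of two template instantiations, so the per-pixel `if (isCalCentrePos)` checks in the implementation are resolved at compile time. Below is a generic sketch of that dispatch pattern; kernel_sketch and dispatch_sketch are hypothetical names used only for illustration, not vvenc identifiers.

#include <cstdio>

// Generic illustration of promoting a runtime flag to a template parameter.
template<bool withCentre>
static int kernel_sketch(int n)
{
  int acc = 0;
  for (int i = 0; i < n; i++)
  {
    acc += i;
    if (withCentre)      // compile-time constant: the dead branch is removed entirely
      acc += 2 * i;
  }
  return acc;
}

static int dispatch_sketch(int n, bool withCentre)
{
  // The only runtime branch sits here, outside the loop.
  return withCentre ? kernel_sketch<true>(n) : kernel_sketch<false>(n);
}

int main()
{
  std::printf("%d %d\n", dispatch_sketch(8, true), dispatch_sketch(8, false));
  return 0;
}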
3561
3562
template <X86_VEXT vext>
3563
void RdCost::_initRdCostX86()
3564
0
{
3565
  /* SIMD SSE implementation shifts the final sum instead of every addend
3566
   * resulting in slightly different result compared to the scalar impl. */
3567
3568
0
  m_afpDistortFunc[0][DF_SSE    ] = xGetSSE_SIMD<vext>;
3569
//m_afpDistortFunc[0][DF_SSE2   ] = xGetSSE_SIMD<vext>;
3570
0
  m_afpDistortFunc[0][DF_SSE4   ] = xGetSSE_NxN_SIMD<4,  vext>;
3571
0
  m_afpDistortFunc[0][DF_SSE8   ] = xGetSSE_NxN_SIMD<8,  vext>;
3572
0
  m_afpDistortFunc[0][DF_SSE16  ] = xGetSSE_NxN_SIMD<16, vext>;
3573
0
  m_afpDistortFunc[0][DF_SSE32  ] = xGetSSE_NxN_SIMD<32, vext>;
3574
0
  m_afpDistortFunc[0][DF_SSE64  ] = xGetSSE_NxN_SIMD<64, vext>;
3575
0
  m_afpDistortFunc[0][DF_SSE128]  = xGetSSE_NxN_SIMD<128, vext>;
3576
3577
0
  m_afpDistortFunc[0][DF_SAD    ] = xGetSAD_SIMD<vext>;
3578
//m_afpDistortFunc[0][DF_SAD2   ] = xGetSAD_SIMD<vext>;
3579
0
  m_afpDistortFunc[0][DF_SAD4   ] = xGetSAD_NxN_SIMD<4,  vext>;
3580
0
  m_afpDistortFunc[0][DF_SAD8   ] = xGetSAD_NxN_SIMD<8,  vext>;
3581
0
  m_afpDistortFunc[0][DF_SAD16  ] = xGetSAD_NxN_SIMD<16, vext>;
3582
0
  m_afpDistortFunc[0][DF_SAD32  ] = xGetSAD_NxN_SIMD<32, vext>;
3583
0
  m_afpDistortFunc[0][DF_SAD64  ] = xGetSAD_NxN_SIMD<64, vext>;
3584
0
  m_afpDistortFunc[0][DF_SAD128]  = xGetSAD_NxN_SIMD<128, vext>;
3585
3586
0
  m_afpDistortFunc[0][DF_HAD]     = RdCost::xGetHADs_SIMD<vext, false>;
3587
0
  m_afpDistortFunc[0][DF_HAD2]    = RdCost::xGetHADs_SIMD<vext, false>;
3588
0
  m_afpDistortFunc[0][DF_HAD4]    = RdCost::xGetHADs_SIMD<vext, false>;
3589
0
  m_afpDistortFunc[0][DF_HAD8]    = RdCost::xGetHADs_SIMD<vext, false>;
3590
0
  m_afpDistortFunc[0][DF_HAD16]   = RdCost::xGetHADs_SIMD<vext, false>;
3591
0
  m_afpDistortFunc[0][DF_HAD32]   = RdCost::xGetHADs_SIMD<vext, false>;
3592
0
  m_afpDistortFunc[0][DF_HAD64]   = RdCost::xGetHADs_SIMD<vext, false>;
3593
0
  m_afpDistortFunc[0][DF_HAD128]  = RdCost::xGetHADs_SIMD<vext, false>;
3594
3595
0
  m_afpDistortFunc[0][DF_HAD_fast]     = RdCost::xGetHADs_SIMD<vext, true>;
3596
0
  m_afpDistortFunc[0][DF_HAD2_fast]    = RdCost::xGetHADs_SIMD<vext, true>;
3597
0
  m_afpDistortFunc[0][DF_HAD4_fast]    = RdCost::xGetHADs_SIMD<vext, true>;
3598
0
  m_afpDistortFunc[0][DF_HAD8_fast]    = RdCost::xGetHADs_SIMD<vext, true>;
3599
0
  m_afpDistortFunc[0][DF_HAD16_fast]   = RdCost::xGetHADs_SIMD<vext, true>;
3600
0
  m_afpDistortFunc[0][DF_HAD32_fast]   = RdCost::xGetHADs_SIMD<vext, true>;
3601
0
  m_afpDistortFunc[0][DF_HAD64_fast]   = RdCost::xGetHADs_SIMD<vext, true>;
3602
0
  m_afpDistortFunc[0][DF_HAD128_fast]  = RdCost::xGetHADs_SIMD<vext, true>;
3603
3604
0
  m_afpDistortFunc[0][DF_HAD_2SAD ]     = RdCost::xGetHAD2SADs_SIMD<vext>;
3605
0
  m_afpDistortFunc[0][DF_SAD_WITH_MASK] = xGetSADwMask_SIMD<vext>;
3606
3607
0
  m_wtdPredPtr[0] = lumaWeightedSSE_SIMD<vext, 0>;
3608
0
  m_wtdPredPtr[1] = lumaWeightedSSE_SIMD<vext, 1>;
3609
0
  m_fxdWtdPredPtr = fixWeightedSSE_SIMD <vext>;
3610
3611
0
  m_afpDistortFuncX5[0] = xGetSADX5_8xN_SIMD <vext>;
3612
0
  m_afpDistortFuncX5[1] = xGetSADX5_16xN_SIMD_X86<vext>;
3613
0
}
Unexecuted instantiation: void vvenc::RdCost::_initRdCostX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::RdCost::_initRdCostX86<(vvenc::x86_simd::X86_VEXT)4>()
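The comment at the top of _initRdCostX86 notes that the SIMD SSE kernels shift the accumulated sum once instead of shifting every addend. A small worked example of why that changes the result (editorial illustration, not part of the source; the values and uiShift are made up):

#include <cstdint>
#include <cstdio>

int main()
{
  // With uiShift = 1 and per-sample squared errors of 3, 3, 3, 3:
  const uint32_t uiShift = 1;
  const uint64_t sq[4] = { 3, 3, 3, 3 };

  uint64_t perAddend = 0, finalSum = 0;
  for (uint64_t e : sq)
  {
    perAddend += e >> uiShift;   // shift every addend: 1 + 1 + 1 + 1 = 4
    finalSum  += e;              // accumulate first:   3 + 3 + 3 + 3 = 12
  }
  finalSum >>= uiShift;          // shift the final sum once: 12 >> 1 = 6

  std::printf("per-addend: %llu, final-sum: %llu\n",
              (unsigned long long)perAddend, (unsigned long long)finalSum);
  return 0;
}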
3614
3615
template void RdCost::_initRdCostX86<SIMDX86>();
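More broadly, _initRdCostX86 follows a dispatch-table design: each distortion-metric/block-size slot is filled once with the best available SIMD kernel, and later callers invoke the slot through a function pointer without knowing which ISA variant was selected. A minimal generic sketch of that pattern follows; all names are hypothetical stand-ins, and the real m_afpDistortFunc indices and call sites are not reproduced here.

#include <cstdio>

// Hypothetical, simplified stand-ins for the real DistParam / Distortion types.
struct ParamSketch { int width; int height; };
using DistortionSketch = unsigned long long;
using DistFuncSketch   = DistortionSketch (*)(const ParamSketch&);

static DistortionSketch sadScalarSketch(const ParamSketch& p) { return DistortionSketch(p.width) * p.height; }
static DistortionSketch sadSimdSketch  (const ParamSketch& p) { return DistortionSketch(p.width) * p.height; }

enum { DF_SAD_SKETCH = 0, DF_COUNT_SKETCH };

int main()
{
  // The table is filled once at init time (scalar defaults, then SIMD overrides if
  // supported), mirroring how _initRdCostX86 overwrites the distortion-function entries.
  DistFuncSketch table[DF_COUNT_SKETCH] = { sadScalarSketch };
  const bool haveSimd = true;              // stand-in for a runtime CPU-feature check
  if (haveSimd) table[DF_SAD_SKETCH] = sadSimdSketch;

  const ParamSketch p{ 16, 16 };
  std::printf("%llu\n", table[DF_SAD_SKETCH](p)); // hot path: one indirect call, no branching
  return 0;
}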
3616
3617
} // namespace vvenc
3618
3619
//! \}
3620
3621
#endif // TARGET_SIMD_X86 && ENABLE_SIMD_OPT_DIST
3622