Coverage Report

Created: 2026-04-01 07:49

/src/vvdec/source/Lib/CommonLib/x86/RdCostX86.h
Every instrumented line in this file reports an execution count of 0: none of the SIMD SAD kernels below were executed in the measured run, and each template instantiation listed after a function body is reported as unexecuted.

/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the Clear BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.

The Clear BSD License

Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted (subject to the limitations in the disclaimer below) provided that
the following conditions are met:

     * Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.

     * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

     * Neither the name of the copyright holder nor the names of its
     contributors may be used to endorse or promote products derived from this
     software without specific prior written permission.

NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.


------------------------------------------------------------------------------------------- */

/** \file     RdCostX86.cpp
    \brief    RD cost computation class, SIMD version
*/

#include <math.h>
#include <limits>

#include "CommonDefX86.h"
#include "../RdCost.h"

#if defined(TARGET_SIMD_X86)  && ENABLE_SIMD_OPT_DIST

namespace vvdec
{

using namespace x86_simd;

template<X86_VEXT vext, bool isWdt16>
Distortion xGetSAD_MxN_SIMD( const DistParam &rcDtParam )
{
  if( rcDtParam.bitDepth > 10 )
    return isWdt16 ? RdCost::xGetSAD16( rcDtParam ) : RdCost::xGetSAD8( rcDtParam );

  //  assert( rcDtParam.iCols == iWidth);
  const short* pSrc1          = (const short*)rcDtParam.org.buf;
  const short* pSrc2          = (const short*)rcDtParam.cur.buf;
  const int  iRows            = rcDtParam.org.height;
  const int  iSubShift        = rcDtParam.subShift;
  const ptrdiff_t iStrideSrc1 = rcDtParam.org.stride << iSubShift;
  const ptrdiff_t iStrideSrc2 = rcDtParam.cur.stride << iSubShift;

  uint32_t uiSum = 0;

  if( vext >= AVX2 && isWdt16 )
  {
#ifdef USE_AVX2
    __m256i vone   = _mm256_set1_epi16( 1 );
    __m256i vsum32 = _mm256_setzero_si256();
    __m256i vsum16 = _mm256_setzero_si256();

    // sum of 8 unsigned 10-bit ints (0-1023) can maximally be 3 + 10 bits, i.e. fits into 16 bit

    for( int i = 0; i < ( iRows >> 3 ); i++ )
    {
      //0
      __m256i vsrc1 = _mm256_loadu_si256( ( __m256i* )( pSrc1 ) );
      __m256i vsrc2 = _mm256_loadu_si256( ( __m256i* )( pSrc2 ) );

      pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;
      vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) );

      // 1
      vsrc1 = _mm256_loadu_si256( ( __m256i* )( pSrc1 ) ); vsrc2 = _mm256_loadu_si256( ( __m256i* )( pSrc2 ) );
      pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;
      vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) );

      // 2
      vsrc1 = _mm256_loadu_si256( ( __m256i* )( pSrc1 ) ); vsrc2 = _mm256_loadu_si256( ( __m256i* )( pSrc2 ) );
      pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;
      vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) );

      // 3
      vsrc1 = _mm256_loadu_si256( ( __m256i* )( pSrc1 ) ); vsrc2 = _mm256_loadu_si256( ( __m256i* )( pSrc2 ) );
      pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;
      vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) );
    }


    vsum32 = _mm256_madd_epi16( vsum16, vone );
    vsum32 = _mm256_hadd_epi32( vsum32, vone );
    vsum32 = _mm256_hadd_epi32( vsum32, vone );
    uiSum =  _mm_cvtsi128_si32( _mm256_castsi256_si128( vsum32 ) ) + _mm_cvtsi128_si32( _mm256_extracti128_si256( vsum32, 1 ) );
#endif
  }
  else
  {
    __m128i vone   = _mm_set1_epi16( 1 );
    __m128i vsum32 = _mm_setzero_si128();
    __m128i vsum16 = _mm_setzero_si128();

    // sum of 16 unsigned 10-bit ints (0-1023) can maximally be 4 + 10 bits, i.e. fits into 16 bit

    for( int i = 0; i < ( iRows >> 3 ); i++ )
    {
      //0
      __m128i vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) );
      __m128i vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) );

      vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );

      if( isWdt16 )
      {
        vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) );
        vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) );

        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
      }

      pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;

      // 1
      vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) );
      vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) );

      vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );

      if( isWdt16 )
      {
        vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) );
        vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) );

        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
      }

      pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;

      // 2
      vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) );
      vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) );

      vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );

      if( isWdt16 )
      {
        vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) );
        vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) );

        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
      }

      pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;

      // 3
      vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) );
      vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) );

      vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );

      if( isWdt16 )
      {
        vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) );
        vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) );

        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
      }

      pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2;
    }


    vsum32 = _mm_madd_epi16( vsum16, vone );
    vsum32 = _mm_hadd_epi32( vsum32, vone );
    vsum32 = _mm_hadd_epi32( vsum32, vone );
    uiSum = _mm_cvtsi128_si32( vsum32 );
  }

  uiSum <<= iSubShift;
  return uiSum;
}
Unexecuted instantiation: unsigned int vvdec::xGetSAD_MxN_SIMD<(vvdec::x86_simd::X86_VEXT)1, false>(vvdec::DistParam const&)
Unexecuted instantiation: unsigned int vvdec::xGetSAD_MxN_SIMD<(vvdec::x86_simd::X86_VEXT)1, true>(vvdec::DistParam const&)
Unexecuted instantiation: unsigned int vvdec::xGetSAD_MxN_SIMD<(vvdec::x86_simd::X86_VEXT)4, false>(vvdec::DistParam const&)
Unexecuted instantiation: unsigned int vvdec::xGetSAD_MxN_SIMD<(vvdec::x86_simd::X86_VEXT)4, true>(vvdec::DistParam const&)
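For orientation, here is a rough scalar reference for what this kernel computes: the SAD of an 8- or 16-pel-wide block, visiting rows in steps of (1 << subShift) and compensating for the skipped rows with a final left shift. This is an editor's sketch, not part of RdCostX86.h; it reuses the project's Pel, Distortion and DistParam names as seen above, while the helper name xGetSAD_MxN_Ref is hypothetical.

// Editor's illustration only, not part of RdCostX86.h.
template<bool isWdt16>
Distortion xGetSAD_MxN_Ref( const DistParam& dp )
{
  const Pel* org   = dp.org.buf;
  const Pel* cur   = dp.cur.buf;
  const int  width = isWdt16 ? 16 : 8;
  const int  step  = 1 << dp.subShift;

  Distortion sum = 0;
  for( int y = 0; y < dp.org.height; y += step )   // subShift skips rows
  {
    for( int x = 0; x < width; x++ )
    {
      const int d = org[x] - cur[x];
      sum += d < 0 ? -d : d;                       // absolute difference
    }
    org += dp.org.stride * step;
    cur += dp.cur.stride * step;
  }
  return sum << dp.subShift;                       // same compensation as uiSum <<= iSubShift above
}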

template <X86_VEXT vext, bool isCalCentrePos>
void xGetSADX5_8xN_SIMDImp(const DistParam& rcDtParam, Distortion* cost) {
  int i;
  const Pel* piOrg = rcDtParam.org.buf;
  const Pel* piCur = rcDtParam.cur.buf - 4;
  int height = rcDtParam.org.height;
  int iSubShift = rcDtParam.subShift;
  int iSubStep = (1 << iSubShift);
  ptrdiff_t iStrideCur = rcDtParam.cur.stride * iSubStep;
  ptrdiff_t iStrideOrg = rcDtParam.org.stride * iSubStep;

  __m128i sum0 = _mm_setzero_si128();
  __m128i sum1 = _mm_setzero_si128();
  __m128i sum2 = _mm_setzero_si128();
  __m128i sum3 = _mm_setzero_si128();
  __m128i sum4 = _mm_setzero_si128();

  __m128i vone = _mm_set1_epi16(1);
  for (i = 0; i < height; i += iSubStep) {
    __m128i s0 = _mm_loadu_si128((__m128i*)piOrg);
    __m128i s1 = _mm_loadu_si128((__m128i*)piCur);
    __m128i s2 = _mm_loadu_si64 ((__m128i*)(piOrg + 8));
    __m128i s3 = _mm_loadu_si64 ((__m128i*)(piCur + 8));

    __m128i org0, org1, org2, org3, org4;
    org0 = s0;
    org1 = _mm_alignr_epi8(s2, s0, 2);
    if (isCalCentrePos) org2 = _mm_alignr_epi8(s2, s0, 4);
    org3 = _mm_alignr_epi8(s2, s0, 6);
    org4 = _mm_alignr_epi8(s2, s0, 8);

    __m128i cur0, cur1, cur2, cur3, cur4;
    cur4 = s1;
    cur0 = _mm_alignr_epi8(s3, s1, 8);
    cur1 = _mm_alignr_epi8(s3, s1, 6);
    if (isCalCentrePos) cur2 = _mm_alignr_epi8(s3, s1, 4);
    cur3 = _mm_alignr_epi8(s3, s1, 2);

    __m128i diff0, diff1, diff2, diff3, diff4;
    diff0 = _mm_sub_epi16(org0, cur0);
    diff1 = _mm_sub_epi16(org1, cur1);
    if (isCalCentrePos) diff2 = _mm_sub_epi16(org2, cur2);
    diff3 = _mm_sub_epi16(org3, cur3);
    diff4 = _mm_sub_epi16(org4, cur4);

    diff0 = _mm_abs_epi16(diff0);
    diff1 = _mm_abs_epi16(diff1);
    if (isCalCentrePos) diff2 = _mm_abs_epi16(diff2);
    diff3 = _mm_abs_epi16(diff3);
    diff4 = _mm_abs_epi16(diff4);

    sum0 = _mm_add_epi16(sum0, diff0);
    sum1 = _mm_add_epi16(sum1, diff1);
    if (isCalCentrePos) sum2 = _mm_add_epi32(sum2, diff2);
    sum3 = _mm_add_epi16(sum3, diff3);
    sum4 = _mm_add_epi16(sum4, diff4);

    INCY(piOrg, iStrideOrg);
    INCY(piCur, iStrideCur);
  }

  sum0 = _mm_madd_epi16( sum0, vone );
  sum1 = _mm_madd_epi16( sum1, vone );
  if( isCalCentrePos ) sum2 = _mm_madd_epi16( sum2, vone );
  sum3 = _mm_madd_epi16( sum3, vone );
  sum4 = _mm_madd_epi16( sum4, vone );

  sum0 = _mm_hadd_epi32(sum0, sum1);
  sum3 = _mm_hadd_epi32(sum3, sum4);
  if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2);

  sum0 = _mm_hadd_epi32(sum0, sum3);
  if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2);

  sum0 = _mm_sll_epi32(sum0, _mm_cvtsi32_si128(iSubShift));
  if (isCalCentrePos) sum2 = _mm_sll_epi32(sum2, _mm_cvtsi32_si128(iSubShift));

  sum0 = _mm_srli_epi32(sum0, 1);
  if (isCalCentrePos) sum2 = _mm_srli_epi32(sum2, 1);

  _mm_storeu_si64( ( __m128i* ) &cost[0], sum0 );
  if (isCalCentrePos) cost[2] = (_mm_cvtsi128_si32(sum2));
  _mm_storeu_si64( ( __m128i* ) &cost[3], _mm_unpackhi_epi64( sum0, sum0 ) );
}
Unexecuted instantiation: void vvdec::xGetSADX5_8xN_SIMDImp<(vvdec::x86_simd::X86_VEXT)1, true>(vvdec::DistParam const&, unsigned int*)
Unexecuted instantiation: void vvdec::xGetSADX5_8xN_SIMDImp<(vvdec::x86_simd::X86_VEXT)1, false>(vvdec::DistParam const&, unsigned int*)
Unexecuted instantiation: void vvdec::xGetSADX5_8xN_SIMDImp<(vvdec::x86_simd::X86_VEXT)4, true>(vvdec::DistParam const&, unsigned int*)
Unexecuted instantiation: void vvdec::xGetSADX5_8xN_SIMDImp<(vvdec::x86_simd::X86_VEXT)4, false>(vvdec::DistParam const&, unsigned int*)
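The X5 kernels evaluate five candidate positions per call: piCur starts 4 pels left of cur.buf, and the k-th window pair shifts org right by k pels while shifting cur left by k pels (each _mm_alignr_epi8 step is 2 bytes, i.e. one 16-bit pel), so the two blocks move in opposite directions. cost[2] is the centre position and is only produced when isCalCentrePos is set, and every result is halved at the end, matching the _mm_srli_epi32(…, 1) above. The following scalar outline of the same indexing is an editor's illustration, not code from the file; the helper name sadX5_8xN_Ref is hypothetical.

// Editor's illustration of the indexing used above (not part of RdCostX86.h).
void sadX5_8xN_Ref( const DistParam& dp, Distortion* cost, bool isCalCentrePos )
{
  const Pel* org  = dp.org.buf;
  const Pel* cur  = dp.cur.buf - 4;           // same rewind as piCur above
  const int  step = 1 << dp.subShift;

  for( int k = 0; k < 5; k++ )
  {
    if( k == 2 && !isCalCentrePos ) continue; // centre SAD only on request

    Distortion sum = 0;
    for( int y = 0; y < dp.org.height; y += step )
      for( int x = 0; x < 8; x++ )
      {
        // org window shifted right by k pels, cur window shifted left by k pels
        const int d = org[y * dp.org.stride + x + k] - cur[y * dp.cur.stride + x + 4 - k];
        sum += d < 0 ? -d : d;
      }
    cost[k] = ( sum << dp.subShift ) >> 1;    // row-skip compensation, then halving
  }
}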

template <X86_VEXT vext>
void xGetSADX5_8xN_SIMD(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
  if( rcDtParam.bitDepth > 10 ){
    RdCost::xGetSAD16X5( rcDtParam, cost, isCalCentrePos );
    return;
  }

  if (isCalCentrePos)
    xGetSADX5_8xN_SIMDImp<vext, true>(rcDtParam, cost);
  else
    xGetSADX5_8xN_SIMDImp<vext, false>(rcDtParam, cost);
}
Unexecuted instantiation: void vvdec::xGetSADX5_8xN_SIMD<(vvdec::x86_simd::X86_VEXT)1>(vvdec::DistParam const&, unsigned int*, bool)
Unexecuted instantiation: void vvdec::xGetSADX5_8xN_SIMD<(vvdec::x86_simd::X86_VEXT)4>(vvdec::DistParam const&, unsigned int*, bool)
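The bitDepth > 10 guard in these dispatchers exists because the SIMD paths accumulate absolute differences in 16-bit lanes; that is safe for 10-bit content but not for higher bit depths, so deeper content falls back to the generic RdCost routines. A back-of-the-envelope check of that bound follows; it is the editor's own illustration (the figure of 64 per-lane accumulations is an assumed worst case, not taken from the file).

#include <cstdint>

// Editor's illustration: a 10-bit absolute difference is at most 1023, so even 64
// per-lane accumulations stay below 2^16 and the epi16 sums above cannot overflow;
// at 12 bits a single difference can reach 4095 and the same count no longer fits,
// which is what the bitDepth > 10 fallback avoids.
constexpr uint32_t maxAbsDiff( int bitDepth ) { return ( 1u << bitDepth ) - 1; }

static_assert( 64 * maxAbsDiff( 10 ) <  ( 1u << 16 ), "10-bit diffs fit 16-bit lanes" );
static_assert( 64 * maxAbsDiff( 12 ) >= ( 1u << 16 ), "12-bit diffs can overflow them" );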

template <X86_VEXT vext, bool isCalCentrePos>
void xGetSADX5_16xN_SIMDImp(const DistParam& rcDtParam, Distortion* cost) {
  int i, j;
  const Pel* piOrg = rcDtParam.org.buf;
  const Pel* piCur = rcDtParam.cur.buf - 4;
  int height = rcDtParam.org.height;
  int iSubShift = rcDtParam.subShift;
  int iSubStep = (1 << iSubShift);
  ptrdiff_t iStrideCur = rcDtParam.cur.stride * iSubStep;
  ptrdiff_t iStrideOrg = rcDtParam.org.stride * iSubStep;

#  ifdef USE_AVX2
  if (vext >= AVX2) {
    // sum of 8 unsigned 10-bit ints (0-1023) can maximally be 3 + 10 bits, i.e. fits into 16 bit

    __m256i sum0 = _mm256_setzero_si256();
    __m256i sum1 = _mm256_setzero_si256();
    __m256i sum2 = _mm256_setzero_si256();
    __m256i sum3 = _mm256_setzero_si256();
    __m256i sum4 = _mm256_setzero_si256();

    __m256i vone = _mm256_set1_epi16(1);

    for (int i = 0; i < ( height >> 3 ); i++) {
      __m256i s0 = _mm256_loadu_si256((__m256i*)piOrg);
      __m256i s1 = _mm256_loadu_si256((__m256i*)piCur);
      __m256i s2 = _mm256_castsi128_si256(_mm_loadu_si64((__m128i*)(piOrg + 16)));
      __m256i s3 = _mm256_castsi128_si256(_mm_loadu_si64((__m128i*)(piCur + 16)));
      s2 = _mm256_permute2x128_si256(s0, s2, 0x21);
      s3 = _mm256_permute2x128_si256(s1, s3, 0x21);

      INCY(piOrg, iStrideOrg);
      INCY(piCur, iStrideCur);

      __m256i org0, org1, org2, org3, org4;
      org0 = s0;
      org1 = _mm256_alignr_epi8(s2, s0, 2);
      if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4);
      org3 = _mm256_alignr_epi8(s2, s0, 6);
      org4 = _mm256_alignr_epi8(s2, s0, 8);

      __m256i cur0, cur1, cur2, cur3, cur4;
      cur4 = s1;
      cur0 = _mm256_alignr_epi8(s3, s1, 8);
      cur1 = _mm256_alignr_epi8(s3, s1, 6);
      if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4);
      cur3 = _mm256_alignr_epi8(s3, s1, 2);

      __m256i diff0, diff1, diff2, diff3, diff4;
      diff0 = _mm256_sub_epi16(org0, cur0);
      diff1 = _mm256_sub_epi16(org1, cur1);
      if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2);
      diff3 = _mm256_sub_epi16(org3, cur3);
      diff4 = _mm256_sub_epi16(org4, cur4);

      diff0 = _mm256_abs_epi16( diff0 );
      diff1 = _mm256_abs_epi16( diff1 );
      if( isCalCentrePos ) diff2 = _mm256_abs_epi16( diff2 );
      diff3 = _mm256_abs_epi16( diff3 );
      diff4 = _mm256_abs_epi16( diff4 );

      sum0 = _mm256_add_epi16( diff0, sum0 );
      sum1 = _mm256_add_epi16( diff1, sum1 );
      if( isCalCentrePos ) sum2 = _mm256_add_epi16( diff2, sum2 );
      sum3 = _mm256_add_epi16( diff3, sum3 );
      sum4 = _mm256_add_epi16( diff4, sum4 );

      s0 = _mm256_loadu_si256((__m256i*)piOrg);
      s1 = _mm256_loadu_si256((__m256i*)piCur);
      s2 = _mm256_castsi128_si256(_mm_loadu_si64((__m128i*)(piOrg + 16)));
      s3 = _mm256_castsi128_si256(_mm_loadu_si64((__m128i*)(piCur + 16)));
      s2 = _mm256_permute2x128_si256(s0, s2, 0x21);
      s3 = _mm256_permute2x128_si256(s1, s3, 0x21);

      INCY(piOrg, iStrideOrg);
      INCY(piCur, iStrideCur);

      org0 = s0;
      org1 = _mm256_alignr_epi8(s2, s0, 2);
      if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4);
      org3 = _mm256_alignr_epi8(s2, s0, 6);
      org4 = _mm256_alignr_epi8(s2, s0, 8);

      cur4 = s1;
      cur0 = _mm256_alignr_epi8(s3, s1, 8);
      cur1 = _mm256_alignr_epi8(s3, s1, 6);
      if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4);
      cur3 = _mm256_alignr_epi8(s3, s1, 2);

      diff0 = _mm256_sub_epi16(org0, cur0);
      diff1 = _mm256_sub_epi16(org1, cur1);
      if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2);
      diff3 = _mm256_sub_epi16(org3, cur3);
      diff4 = _mm256_sub_epi16(org4, cur4);

      diff0 = _mm256_abs_epi16(diff0);
      diff1 = _mm256_abs_epi16(diff1);
      if (isCalCentrePos) diff2 = _mm256_abs_epi16(diff2);
      diff3 = _mm256_abs_epi16(diff3);
      diff4 = _mm256_abs_epi16(diff4);

      sum0 = _mm256_add_epi16(diff0, sum0);
      sum1 = _mm256_add_epi16(diff1, sum1);
      if (isCalCentrePos) sum2 = _mm256_add_epi16(diff2, sum2);
      sum3 = _mm256_add_epi16(diff3, sum3);
      sum4 = _mm256_add_epi16(diff4, sum4);

      s0 = _mm256_loadu_si256((__m256i*)piOrg);
      s1 = _mm256_loadu_si256((__m256i*)piCur);
      s2 = _mm256_castsi128_si256(_mm_loadu_si64((__m128i*)(piOrg + 16)));
      s3 = _mm256_castsi128_si256(_mm_loadu_si64((__m128i*)(piCur + 16)));
      s2 = _mm256_permute2x128_si256(s0, s2, 0x21);
      s3 = _mm256_permute2x128_si256(s1, s3, 0x21);

      INCY(piOrg, iStrideOrg);
      INCY(piCur, iStrideCur);

      org0 = s0;
      org1 = _mm256_alignr_epi8(s2, s0, 2);
      if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4);
      org3 = _mm256_alignr_epi8(s2, s0, 6);
      org4 = _mm256_alignr_epi8(s2, s0, 8);

      cur4 = s1;
      cur0 = _mm256_alignr_epi8(s3, s1, 8);
      cur1 = _mm256_alignr_epi8(s3, s1, 6);
      if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4);
      cur3 = _mm256_alignr_epi8(s3, s1, 2);

      diff0 = _mm256_sub_epi16(org0, cur0);
      diff1 = _mm256_sub_epi16(org1, cur1);
      if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2);
      diff3 = _mm256_sub_epi16(org3, cur3);
      diff4 = _mm256_sub_epi16(org4, cur4);

      diff0 = _mm256_abs_epi16(diff0);
      diff1 = _mm256_abs_epi16(diff1);
      if (isCalCentrePos) diff2 = _mm256_abs_epi16(diff2);
      diff3 = _mm256_abs_epi16(diff3);
      diff4 = _mm256_abs_epi16(diff4);

      sum0 = _mm256_add_epi16( diff0, sum0 );
      sum1 = _mm256_add_epi16( diff1, sum1 );
      if( isCalCentrePos ) sum2 = _mm256_add_epi16( diff2, sum2 );
      sum3 = _mm256_add_epi16( diff3, sum3 );
      sum4 = _mm256_add_epi16( diff4, sum4 );

      s0 = _mm256_loadu_si256((__m256i*)piOrg);
      s1 = _mm256_loadu_si256((__m256i*)piCur);
      s2 = _mm256_castsi128_si256(_mm_loadu_si64((__m128i*)(piOrg + 16)));
      s3 = _mm256_castsi128_si256(_mm_loadu_si64((__m128i*)(piCur + 16)));
      s2 = _mm256_permute2x128_si256(s0, s2, 0x21);
      s3 = _mm256_permute2x128_si256(s1, s3, 0x21);

      INCY(piOrg, iStrideOrg);
      INCY(piCur, iStrideCur);

      org0 = s0;
      org1 = _mm256_alignr_epi8(s2, s0, 2);
      if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4);
      org3 = _mm256_alignr_epi8(s2, s0, 6);
      org4 = _mm256_alignr_epi8(s2, s0, 8);

      cur4 = s1;
      cur0 = _mm256_alignr_epi8(s3, s1, 8);
      cur1 = _mm256_alignr_epi8(s3, s1, 6);
      if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4);
      cur3 = _mm256_alignr_epi8(s3, s1, 2);

      diff0 = _mm256_sub_epi16(org0, cur0);
      diff1 = _mm256_sub_epi16(org1, cur1);
      if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2);
      diff3 = _mm256_sub_epi16(org3, cur3);
      diff4 = _mm256_sub_epi16(org4, cur4);

      diff0 = _mm256_abs_epi16(diff0);
      diff1 = _mm256_abs_epi16(diff1);
      if (isCalCentrePos) diff2 = _mm256_abs_epi16(diff2);
      diff3 = _mm256_abs_epi16(diff3);
      diff4 = _mm256_abs_epi16(diff4);

      sum0 = _mm256_add_epi16(diff0, sum0);
      sum1 = _mm256_add_epi16(diff1, sum1);
      if (isCalCentrePos) sum2 = _mm256_add_epi16(diff2, sum2);
      sum3 = _mm256_add_epi16(diff3, sum3);
      sum4 = _mm256_add_epi16(diff4, sum4);
    }

    sum0 = _mm256_madd_epi16( sum0, vone );
    sum1 = _mm256_madd_epi16( sum1, vone );
    if( isCalCentrePos ) sum2 = _mm256_madd_epi16( sum2, vone );
    sum3 = _mm256_madd_epi16( sum3, vone );
    sum4 = _mm256_madd_epi16( sum4, vone );

    sum0 = _mm256_hadd_epi32(sum0, sum1);
    sum3 = _mm256_hadd_epi32(sum3, sum4);
    if (isCalCentrePos) sum2 = _mm256_hadd_epi32(sum2, sum2);

    sum0 = _mm256_hadd_epi32(sum0, sum3);
    if (isCalCentrePos) sum2 = _mm256_hadd_epi32(sum2, sum2);

    __m128i sum0134 = _mm_add_epi32(_mm256_castsi256_si128(sum0), _mm256_extracti128_si256(sum0, 1));

    sum0134 = _mm_sll_epi32(sum0134, _mm_cvtsi32_si128(iSubShift));

    sum0134 = _mm_srli_epi32(sum0134, 1);

    _mm_storeu_si64( ( __m128i* ) &cost[0], sum0134 );
    if (isCalCentrePos) {
      int tmp = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum2)) + _mm256_extract_epi32(sum2, 4);
      tmp <<= iSubShift;
      tmp >>= 1;
      cost[2] = tmp;
    }
    _mm_storeu_si64( ( __m128i* ) &cost[3], _mm_unpackhi_epi64( sum0134, sum0134 ) );
  }
  else
#  endif
  {
    // sum of 16 unsigned 10-bit ints (0-1023) can maximally be 4 + 10 bits, i.e. fits into 16 bit

    __m128i sum0 = _mm_setzero_si128();
    __m128i sum1 = _mm_setzero_si128();
    __m128i sum2 = _mm_setzero_si128();
    __m128i sum3 = _mm_setzero_si128();
    __m128i sum4 = _mm_setzero_si128();

    __m128i vone = _mm_set1_epi16(1);
    for (i = 0; i < height; i += iSubStep) {
      for (j = 0; j < 16; j += 8) {
        __m128i s0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(piOrg + j + 0));
        __m128i s1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(piCur + j + 0));
        __m128i s2 = _mm_loadu_si64 (reinterpret_cast<const __m128i*>(piOrg + j + 8));
        __m128i s3 = _mm_loadu_si64 (reinterpret_cast<const __m128i*>(piCur + j + 8));

        __m128i org0, org1, org2, org3, org4;
        org0 = s0;
        org1 = _mm_alignr_epi8(s2, s0, 2);
        if (isCalCentrePos) org2 = _mm_alignr_epi8(s2, s0, 4);
        org3 = _mm_alignr_epi8(s2, s0, 6);
        org4 = _mm_alignr_epi8(s2, s0, 8);

        __m128i cur0, cur1, cur2, cur3, cur4;
        cur4 = s1;
        cur0 = _mm_alignr_epi8(s3, s1, 8);
        cur1 = _mm_alignr_epi8(s3, s1, 6);
        if (isCalCentrePos) cur2 = _mm_alignr_epi8(s3, s1, 4);
        cur3 = _mm_alignr_epi8(s3, s1, 2);

        __m128i diff0, diff1, diff2, diff3, diff4;
        diff0 = _mm_sub_epi16(org0, cur0);
        diff1 = _mm_sub_epi16(org1, cur1);
        if (isCalCentrePos) diff2 = _mm_sub_epi16(org2, cur2);
        diff3 = _mm_sub_epi16(org3, cur3);
        diff4 = _mm_sub_epi16(org4, cur4);

        diff0 = _mm_abs_epi16(diff0);
        diff1 = _mm_abs_epi16(diff1);
        if (isCalCentrePos) diff2 = _mm_abs_epi16(diff2);
        diff3 = _mm_abs_epi16(diff3);
        diff4 = _mm_abs_epi16(diff4);

        sum0 = _mm_add_epi16(sum0, diff0);
        sum1 = _mm_add_epi16(sum1, diff1);
        if (isCalCentrePos) sum2 = _mm_add_epi16(sum2, diff2);
        sum3 = _mm_add_epi16(sum3, diff3);
        sum4 = _mm_add_epi16(sum4, diff4);
      }

      INCY(piOrg, iStrideOrg);
      INCY(piCur, iStrideCur);
    }

    sum0 = _mm_madd_epi16( sum0, vone );
    sum1 = _mm_madd_epi16( sum1, vone );
    if( isCalCentrePos ) sum2 = _mm_madd_epi16( sum2, vone );
    sum3 = _mm_madd_epi16( sum3, vone );
    sum4 = _mm_madd_epi16( sum4, vone );

    sum0 = _mm_hadd_epi32(sum0, sum1);
    sum3 = _mm_hadd_epi32(sum3, sum4);
    if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2);

    sum0 = _mm_hadd_epi32(sum0, sum3);
    if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2);

    sum0 = _mm_sll_epi32(sum0, _mm_cvtsi32_si128(iSubShift));
    if (isCalCentrePos) sum2 = _mm_sll_epi32(sum2, _mm_cvtsi32_si128(iSubShift));

    sum0 = _mm_srli_epi32(sum0, 1);
    if (isCalCentrePos) sum2 = _mm_srli_epi32(sum2, 1);

    _mm_storeu_si64( ( __m128i* ) &cost[0], sum0 );
    if (isCalCentrePos) cost[2] = (_mm_cvtsi128_si32(sum2));
    _mm_storeu_si64( ( __m128i* ) &cost[3], _mm_unpackhi_epi64( sum0, sum0 ) );
  }
}
Unexecuted instantiation: void vvdec::xGetSADX5_16xN_SIMDImp<(vvdec::x86_simd::X86_VEXT)1, true>(vvdec::DistParam const&, unsigned int*)
Unexecuted instantiation: void vvdec::xGetSADX5_16xN_SIMDImp<(vvdec::x86_simd::X86_VEXT)1, false>(vvdec::DistParam const&, unsigned int*)
Unexecuted instantiation: void vvdec::xGetSADX5_16xN_SIMDImp<(vvdec::x86_simd::X86_VEXT)4, true>(vvdec::DistParam const&, unsigned int*)
Unexecuted instantiation: void vvdec::xGetSADX5_16xN_SIMDImp<(vvdec::x86_simd::X86_VEXT)4, false>(vvdec::DistParam const&, unsigned int*)
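Rather than issuing one unaligned load per candidate window, these kernels load the block once plus a short tail and derive the shifted windows with _mm_alignr_epi8 / _mm256_alignr_epi8, shifting by 2 bytes per 16-bit pel. The snippet below is a minimal standalone sketch of the SSE form of that trick; it is the editor's own example (the helper name windowAt is hypothetical), not code from the file.

#include <tmmintrin.h>   // SSSE3: _mm_alignr_epi8
#include <cstdint>

// Editor's illustration: one full 8-pel load plus a 4-pel tail load are enough to
// produce every shifted 8-pel window, because _mm_alignr_epi8 concatenates the two
// registers and shifts by a byte count (2 bytes per 16-bit pel).
static inline __m128i windowAt( const int16_t* p, int pelOffset )
{
  const __m128i lo = _mm_loadu_si128( (const __m128i*) p );           // pels 0..7
  const __m128i hi = _mm_loadl_epi64( (const __m128i*) ( p + 8 ) );   // pels 8..11
  switch( pelOffset )
  {
    case 1:  return _mm_alignr_epi8( hi, lo, 2 );   // pels 1..8
    case 2:  return _mm_alignr_epi8( hi, lo, 4 );   // pels 2..9
    case 3:  return _mm_alignr_epi8( hi, lo, 6 );   // pels 3..10
    case 4:  return _mm_alignr_epi8( hi, lo, 8 );   // pels 4..11
    default: return lo;                             // pels 0..7
  }
}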

template <X86_VEXT vext>
void xGetSADX5_16xN_SIMD(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
  if( rcDtParam.bitDepth > 10 ){
    RdCost::xGetSAD16X5( rcDtParam, cost, isCalCentrePos );
    return;
  }

  if (isCalCentrePos)
    xGetSADX5_16xN_SIMDImp<vext, true>(rcDtParam, cost);
  else
    xGetSADX5_16xN_SIMDImp<vext, false>(rcDtParam, cost);
}
Unexecuted instantiation: void vvdec::xGetSADX5_16xN_SIMD<(vvdec::x86_simd::X86_VEXT)1>(vvdec::DistParam const&, unsigned int*, bool)
Unexecuted instantiation: void vvdec::xGetSADX5_16xN_SIMD<(vvdec::x86_simd::X86_VEXT)4>(vvdec::DistParam const&, unsigned int*, bool)
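Both X5 dispatchers leave five values in cost[0..4], with cost[2] written only when the centre position is requested. A hypothetical consumer that picks the cheapest candidate might look like the sketch below; the real selection logic lives elsewhere in the decoder, and the helper name bestOfFive is the editor's own.

// Editor's illustration only: choose the cheapest of the five candidate positions.
int bestOfFive( const Distortion cost[5], bool centreValid )
{
  int best = 0;
  for( int k = 1; k < 5; k++ )
  {
    if( k == 2 && !centreValid ) continue;   // cost[2] is only written on request
    if( cost[k] < cost[best] ) best = k;
  }
  return best;                               // winning candidate index, 0..4
}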

template <X86_VEXT vext>
void RdCost::_initRdCostX86()
{
  m_afpDistortFunc[DF_SAD8   ] = xGetSAD_MxN_SIMD<vext, false>;
  m_afpDistortFunc[DF_SAD16  ] = xGetSAD_MxN_SIMD<vext, true>;

  m_afpDistortFuncX5[DF_SAD8] = xGetSADX5_8xN_SIMD<vext>;
  m_afpDistortFuncX5[DF_SAD16] = xGetSADX5_16xN_SIMD<vext>;
}
Unexecuted instantiation: void vvdec::RdCost::_initRdCostX86<(vvdec::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvdec::RdCost::_initRdCostX86<(vvdec::x86_simd::X86_VEXT)4>()
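_initRdCostX86 only overwrites entries of the RdCost function-pointer tables, and the explicit instantiation below compiles it for the vector extension selected at build time (SIMDX86). The sketch that follows is the editor's simplified illustration of that dispatch pattern; only xGetSAD_MxN_SIMD and SIMDX86 are taken from this file, while the table layout and the names DistortFuncSketch and SadTableSketch are assumptions (the real table, enum and members live in RdCost.h).

// Editor's simplified sketch of the dispatch pattern, not the decoder's actual code.
using DistortFuncSketch = Distortion (*)( const DistParam& );

struct SadTableSketch
{
  DistortFuncSketch sad8  = xGetSAD_MxN_SIMD<SIMDX86, false>;   // 8-pel-wide blocks
  DistortFuncSketch sad16 = xGetSAD_MxN_SIMD<SIMDX86, true>;    // 16-pel-wide blocks

  Distortion operator()( bool wide, const DistParam& dp ) const
  {
    return ( wide ? sad16 : sad8 )( dp );                       // call through the stored pointer
  }
};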

template void RdCost::_initRdCostX86<SIMDX86>();

}
#endif //#if TARGET_SIMD_X86