Coverage Report

Created: 2026-06-10 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/vvenc/source/Lib/CommonLib/RdCost.cpp
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
44
/** \file     RdCost.cpp
45
    \brief    RD cost computation class
46
*/
47
48
#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP
49
50
#include "RdCost.h"
51
#include "Rom.h"
52
#include "UnitPartitioner.h"
53
#include "SearchSpaceCounter.h"
54
55
56
//! \ingroup CommonLib
57
//! \{
58
59
namespace vvenc {
60
61
62
template<int csx>
63
static Distortion lumaWeightedSSE_Core( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights );
64
65
static Distortion fixWeightedSSE_Core( const DistParam& rcDtParam, uint32_t fixedWeight );
66
67
RdCost::RdCost()
68
17.7k
  : m_afpDistortFunc{ { nullptr, }, { nullptr, } }
69
17.7k
{
70
17.7k
}
71
72
RdCost::~RdCost()
73
17.7k
{
74
17.7k
}
75
76
void RdCost::setLambda( double dLambda, const BitDepths &bitDepths )
77
15.7k
{
78
15.7k
  m_dLambda          = dLambda;
79
15.7k
  m_DistScale        = double(1<<SCALE_BITS) / m_dLambda;
80
15.7k
  m_dLambdaMotionSAD = sqrt(m_dLambda);
81
15.7k
}
82
83
84
// Initialize Function Pointer by [eDFunc]
85
void RdCost::create( bool enableOpt )
86
17.7k
{
87
17.7k
  m_signalType                 = RESHAPE_SIGNAL_NULL;
88
17.7k
  m_chromaWeight               = 1.0;
89
17.7k
  m_lumaBD                     = 10;
90
17.7k
  m_afpDistortFunc[0][DF_SSE    ] = RdCost::xGetSSE;
91
17.7k
  m_afpDistortFunc[0][DF_SSE2   ] = RdCost::xGetSSE;
92
17.7k
  m_afpDistortFunc[0][DF_SSE4   ] = RdCost::xGetSSE4;
93
17.7k
  m_afpDistortFunc[0][DF_SSE8   ] = RdCost::xGetSSE8;
94
17.7k
  m_afpDistortFunc[0][DF_SSE16  ] = RdCost::xGetSSE16;
95
17.7k
  m_afpDistortFunc[0][DF_SSE32  ] = RdCost::xGetSSE32;
96
17.7k
  m_afpDistortFunc[0][DF_SSE64  ] = RdCost::xGetSSE64;
97
17.7k
  m_afpDistortFunc[0][DF_SSE128 ] = RdCost::xGetSSE128;
98
99
17.7k
  m_afpDistortFunc[0][DF_SAD    ] = RdCost::xGetSAD;
100
17.7k
  m_afpDistortFunc[0][DF_SAD2   ] = RdCost::xGetSAD;
101
17.7k
  m_afpDistortFunc[0][DF_SAD4   ] = RdCost::xGetSAD4;
102
17.7k
  m_afpDistortFunc[0][DF_SAD8   ] = RdCost::xGetSAD8;
103
17.7k
  m_afpDistortFunc[0][DF_SAD16  ] = RdCost::xGetSAD16;
104
17.7k
  m_afpDistortFunc[0][DF_SAD32  ] = RdCost::xGetSAD32;
105
17.7k
  m_afpDistortFunc[0][DF_SAD64  ] = RdCost::xGetSAD64;
106
17.7k
  m_afpDistortFunc[0][DF_SAD128 ] = RdCost::xGetSAD128;
107
108
17.7k
  m_afpDistortFunc[0][DF_HAD    ] = RdCost::xGetHADs<false>;
109
17.7k
  m_afpDistortFunc[0][DF_HAD2   ] = RdCost::xGetHADs<false>;
110
17.7k
  m_afpDistortFunc[0][DF_HAD4   ] = RdCost::xGetHADs<false>;
111
17.7k
  m_afpDistortFunc[0][DF_HAD8   ] = RdCost::xGetHADs<false>;
112
17.7k
  m_afpDistortFunc[0][DF_HAD16  ] = RdCost::xGetHADs<false>;
113
17.7k
  m_afpDistortFunc[0][DF_HAD32  ] = RdCost::xGetHADs<false>;
114
17.7k
  m_afpDistortFunc[0][DF_HAD64  ] = RdCost::xGetHADs<false>;
115
17.7k
  m_afpDistortFunc[0][DF_HAD128 ] = RdCost::xGetHADs<false>;
116
117
17.7k
  m_afpDistortFunc[0][DF_HAD_fast    ] = RdCost::xGetHADs<true>;
118
17.7k
  m_afpDistortFunc[0][DF_HAD2_fast   ] = RdCost::xGetHADs<true>;
119
17.7k
  m_afpDistortFunc[0][DF_HAD4_fast   ] = RdCost::xGetHADs<true>;
120
17.7k
  m_afpDistortFunc[0][DF_HAD8_fast   ] = RdCost::xGetHADs<true>;
121
17.7k
  m_afpDistortFunc[0][DF_HAD16_fast  ] = RdCost::xGetHADs<true>;
122
17.7k
  m_afpDistortFunc[0][DF_HAD32_fast  ] = RdCost::xGetHADs<true>;
123
17.7k
  m_afpDistortFunc[0][DF_HAD64_fast  ] = RdCost::xGetHADs<true>;
124
17.7k
  m_afpDistortFunc[0][DF_HAD128_fast ] = RdCost::xGetHADs<true>;
125
126
  //  m_afpDistortFunc[0][DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD;
127
17.7k
  m_afpDistortFunc[0][DF_HAD_2SAD ] = RdCost::xGetHAD2SADs;
128
129
17.7k
  m_afpDistortFunc[0][DF_SAD_WITH_MASK] = RdCost::xGetSADwMask;
130
  // m_afpDistortFunc[1] can be used in any case
131
17.7k
  memcpy( m_afpDistortFunc[1], m_afpDistortFunc[0], sizeof(m_afpDistortFunc)/2);
132
133
17.7k
  m_wtdPredPtr[0] = lumaWeightedSSE_Core<0>;
134
17.7k
  m_wtdPredPtr[1] = lumaWeightedSSE_Core<1>;
135
17.7k
  m_fxdWtdPredPtr = fixWeightedSSE_Core;
136
137
17.7k
  m_afpDistortFuncX5[0] = RdCost::xGetSAD8X5;
138
17.7k
  m_afpDistortFuncX5[1] = RdCost::xGetSAD16X5;
139
140
17.7k
#if ENABLE_SIMD_OPT_DIST
141
17.7k
  if( enableOpt )
142
17.7k
  {
143
#ifdef TARGET_SIMD_X86
144
    initRdCostX86();
145
#endif
146
#ifdef TARGET_SIMD_ARM
147
    initRdCostARM();
148
#endif
149
17.7k
  }
150
17.7k
#endif
151
152
17.7k
  m_costMode      = VVENC_COST_STANDARD_LOSSY;
153
17.7k
  m_motionLambda  = 0;
154
17.7k
  m_iCostScale    = 0;
155
17.7k
}
156
157
#if ENABLE_MEASURE_SEARCH_SPACE
158
static Distortion xMeasurePredSearchSpaceInterceptor( const DistParam& dp )
159
{
160
  g_searchSpaceAcc.addPrediction( dp.cur.width, dp.cur.height, toChannelType( dp.compID ) );
161
  return dp.xDistFunc( dp );
162
}
163
164
#endif
165
void RdCost::setDistParam( DistParam &rcDP, const CPelBuf& org, const Pel* piRefY, int iRefStride, int bitDepth, ComponentID compID, int subShiftMode, int useHadamard )
166
46.2k
{
167
46.2k
  rcDP.bitDepth   = bitDepth;
168
46.2k
  rcDP.compID     = compID;
169
170
  // set Original & Curr Pointer / Stride
171
46.2k
  rcDP.org        = org;
172
173
46.2k
  rcDP.cur.buf    = piRefY;
174
46.2k
  rcDP.cur.stride = iRefStride;
175
176
  // set Block Width / Height
177
46.2k
  rcDP.cur.width    = org.width;
178
46.2k
  rcDP.cur.height   = org.height;
179
46.2k
  rcDP.maximumDistortionForEarlyExit = MAX_DISTORTION;
180
181
46.2k
  const int base = (rcDP.bitDepth > 10 || rcDP.applyWeight) ? 1 : 0;
182
46.2k
  if( !useHadamard )
183
46.2k
  {
184
46.2k
    rcDP.distFunc = m_afpDistortFunc[base][ DF_SAD + Log2( org.width ) ];
185
46.2k
  }
186
0
  else
187
0
  {
188
0
    rcDP.distFunc = m_afpDistortFunc[base][( useHadamard == 1 ? DF_HAD : DF_HAD_fast ) + Log2( org.width ) ];
189
0
  }
190
191
  // initialize
192
46.2k
  rcDP.subShift  = 0;
193
194
46.2k
  if( subShiftMode == 1 )
195
0
  {
196
0
    if( rcDP.org.height > 8 && rcDP.org.width <= 128 )
197
0
    {
198
0
      rcDP.subShift = 1;
199
0
    }
200
0
  }
201
46.2k
  else if( subShiftMode == 2 )
202
0
  {
203
0
    if (rcDP.org.height > 8)
204
0
    {
205
0
      rcDP.subShift = 1;
206
0
    }
207
0
  }
208
209
#if ENABLE_MEASURE_SEARCH_SPACE
210
  rcDP.xDistFunc = rcDP.distFunc;
211
  rcDP.distFunc  = xMeasurePredSearchSpaceInterceptor;
212
#endif
213
46.2k
}
214
215
216
DistParam RdCost::setDistParam( const CPelBuf& org, const CPelBuf& cur, int bitDepth, DFunc dfunc )
217
254k
{
218
254k
  int index = dfunc;
219
254k
  if( dfunc != DF_HAD && dfunc != DF_HAD_fast && dfunc != DF_HAD_2SAD )
220
104k
  {
221
104k
    index += Log2(org.width);
222
104k
  }
223
224
254k
  const int base = bitDepth > 10 ? 1:0; //TBD: check does SDA ever overflow
225
#if ENABLE_MEASURE_SEARCH_SPACE
226
  DistParam rcDP( org, cur, m_afpDistortFunc[base][index], bitDepth, 0, COMP_Y );
227
  rcDP.xDistFunc = rcDP.distFunc;
228
  rcDP.distFunc  = xMeasurePredSearchSpaceInterceptor;
229
  return rcDP;
230
#else
231
254k
  return DistParam( org, cur, m_afpDistortFunc[base][index], bitDepth, 0, COMP_Y );
232
254k
#endif
233
254k
}
234
235
DistParam RdCost::setDistParam( const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShift, bool isDMVR )
236
0
{
237
0
  DistParam rcDP;
238
0
  rcDP.bitDepth   = bitDepth;
239
0
  rcDP.compID     = compID;
240
241
0
  rcDP.org.buf    = pOrg;
242
0
  rcDP.org.stride = iOrgStride;
243
0
  rcDP.org.width  = width;
244
0
  rcDP.org.height = height;
245
246
0
  rcDP.cur.buf    = piRefY;
247
0
  rcDP.cur.stride = iRefStride;
248
0
  rcDP.cur.width  = width;
249
0
  rcDP.cur.height = height;
250
0
  rcDP.subShift   = subShift;
251
252
  //  CHECK( useHadamard || rcDP.useMR, "only used in xDMVRCost with these default parameters (so far...)" );
253
0
  const int base = (rcDP.bitDepth > 10) ? 1 : 0;
254
255
0
  rcDP.distFunc = m_afpDistortFunc[base][ DF_SAD + Log2( width ) ];
256
  
257
0
  if( isDMVR )
258
0
  {
259
0
    rcDP.dmvrSadX5 = m_afpDistortFuncX5[Log2( width ) - 3];
260
0
  }
261
262
#if ENABLE_MEASURE_SEARCH_SPACE
263
  if( !isDMVR )
264
  {
265
    // DMVT is part of the decoder complexity
266
    rcDP.xDistFunc = rcDP.distFunc;
267
    rcDP.distFunc = xMeasurePredSearchSpaceInterceptor;
268
  }
269
270
#endif
271
0
  return rcDP;
272
0
}
273
274
Distortion RdCost::getDistPart( const CPelBuf& org, const CPelBuf& cur, int bitDepth, const ComponentID compId, DFunc eDFunc, const CPelBuf* orgLuma )
275
2.49M
{
276
2.49M
  DistParam dp( org, cur, nullptr, bitDepth, 0, compId );
277
# if ENABLE_MEASURE_SEARCH_SPACE
278
  g_searchSpaceAcc.addPrediction( dp.cur.width, dp.cur.height, toChannelType( dp.compID ) );
279
#endif
280
2.49M
  Distortion dist;
281
2.49M
  if( orgLuma )
282
0
  {
283
0
    CHECKD( eDFunc != DF_SSE_WTD, "mismatch func and parameter")
284
0
    dp.orgLuma  = orgLuma;
285
0
    dist = RdCost::xGetSSE_WTD( dp );
286
0
  }
287
2.49M
  else
288
2.49M
  {
289
2.49M
    if( ( org.width == 1 ) )
290
0
    {
291
0
      dist = xGetSSE( dp );
292
0
    }
293
2.49M
    else
294
2.49M
    {
295
2.49M
      const int base = (bitDepth > 10) ? 1 : 0;
296
2.49M
      dist = m_afpDistortFunc[base][eDFunc + Log2(org.width)](dp);
297
2.49M
    }
298
2.49M
  }
299
2.49M
  if (isChroma(compId))
300
2.09M
  {
301
2.09M
    return ((Distortion) (m_distortionWeight[ compId ] * dist));
302
2.09M
  }
303
403k
  else
304
403k
  {
305
403k
    return dist;
306
403k
  }
307
2.49M
}
308
309
// ====================================================================================================================
310
// Distortion functions
311
// ====================================================================================================================
312
313
// --------------------------------------------------------------------------------------------------------------------
314
// SAD
315
// --------------------------------------------------------------------------------------------------------------------
316
317
Distortion RdCost::xGetSAD( const DistParam& rcDtParam )
318
0
{
319
0
  if ( rcDtParam.applyWeight )
320
0
  {
321
0
    THROW(" no support");
322
0
  }
323
324
0
  const Pel* piOrg           = rcDtParam.org.buf;
325
0
  const Pel* piCur           = rcDtParam.cur.buf;
326
0
  const int  iCols           = rcDtParam.org.width;
327
0
        int  iRows           = rcDtParam.org.height;
328
0
  const int  iSubShift       = rcDtParam.subShift;
329
0
  const int  iSubStep        = ( 1 << iSubShift );
330
0
  const int  iStrideCur      = rcDtParam.cur.stride * iSubStep;
331
0
  const int  iStrideOrg      = rcDtParam.org.stride * iSubStep;
332
0
  const uint32_t distortionShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
333
334
0
  Distortion uiSum = 0;
335
336
0
  for( ; iRows != 0; iRows -= iSubStep )
337
0
  {
338
0
    for (int n = 0; n < iCols; n++ )
339
0
    {
340
0
      uiSum += abs( piOrg[n] - piCur[n] );
341
0
    }
342
0
    if (rcDtParam.maximumDistortionForEarlyExit < ( uiSum >> distortionShift ))
343
0
    {
344
0
      return ( uiSum >> distortionShift );
345
0
    }
346
0
    piOrg += iStrideOrg;
347
0
    piCur += iStrideCur;
348
0
  }
349
350
0
  uiSum <<= iSubShift;
351
0
  return ( uiSum >> distortionShift );
352
0
}
353
354
Distortion RdCost::xGetSAD4( const DistParam& rcDtParam )
355
120k
{
356
120k
  if ( rcDtParam.applyWeight )
357
0
  {
358
0
    THROW(" no support");
359
0
  }
360
361
120k
  const Pel* piOrg   = rcDtParam.org.buf;
362
120k
  const Pel* piCur   = rcDtParam.cur.buf;
363
120k
  int  iRows         = rcDtParam.org.height;
364
120k
  int  iSubShift     = rcDtParam.subShift;
365
120k
  int  iSubStep      = ( 1 << iSubShift );
366
120k
  int  iStrideCur    = rcDtParam.cur.stride * iSubStep;
367
120k
  int  iStrideOrg    = rcDtParam.org.stride * iSubStep;
368
369
120k
  Distortion uiSum = 0;
370
371
1.48M
  for( ; iRows != 0; iRows -= iSubStep )
372
1.36M
  {
373
1.36M
    uiSum += abs( piOrg[0] - piCur[0] );
374
1.36M
    uiSum += abs( piOrg[1] - piCur[1] );
375
1.36M
    uiSum += abs( piOrg[2] - piCur[2] );
376
1.36M
    uiSum += abs( piOrg[3] - piCur[3] );
377
378
1.36M
    piOrg += iStrideOrg;
379
1.36M
    piCur += iStrideCur;
380
1.36M
  }
381
382
120k
  uiSum <<= iSubShift;
383
120k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
384
120k
}
385
386
Distortion RdCost::xGetSAD8( const DistParam& rcDtParam )
387
480k
{
388
480k
  if ( rcDtParam.applyWeight )
389
0
  {
390
0
    THROW(" no support");
391
0
  }
392
393
480k
  const Pel* piOrg      = rcDtParam.org.buf;
394
480k
  const Pel* piCur      = rcDtParam.cur.buf;
395
480k
  int  iRows            = rcDtParam.org.height;
396
480k
  int  iSubShift        = rcDtParam.subShift;
397
480k
  int  iSubStep         = ( 1 << iSubShift );
398
480k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
399
480k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
400
401
480k
  Distortion uiSum = 0;
402
403
10.0M
  for( ; iRows != 0; iRows-=iSubStep )
404
9.60M
  {
405
9.60M
    uiSum += abs( piOrg[0] - piCur[0] );
406
9.60M
    uiSum += abs( piOrg[1] - piCur[1] );
407
9.60M
    uiSum += abs( piOrg[2] - piCur[2] );
408
9.60M
    uiSum += abs( piOrg[3] - piCur[3] );
409
9.60M
    uiSum += abs( piOrg[4] - piCur[4] );
410
9.60M
    uiSum += abs( piOrg[5] - piCur[5] );
411
9.60M
    uiSum += abs( piOrg[6] - piCur[6] );
412
9.60M
    uiSum += abs( piOrg[7] - piCur[7] );
413
414
9.60M
    piOrg += iStrideOrg;
415
9.60M
    piCur += iStrideCur;
416
9.60M
  }
417
418
480k
  uiSum <<= iSubShift;
419
480k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
420
480k
}
421
422
Distortion RdCost::xGetSAD16( const DistParam& rcDtParam )
423
420k
{
424
420k
  if ( rcDtParam.applyWeight )
425
0
  {
426
0
    THROW(" no support");
427
0
  }
428
429
420k
  const Pel* piOrg      = rcDtParam.org.buf;
430
420k
  const Pel* piCur      = rcDtParam.cur.buf;
431
420k
  int  iRows            = rcDtParam.org.height;
432
420k
  int  iSubShift        = rcDtParam.subShift;
433
420k
  int  iSubStep         = ( 1 << iSubShift );
434
420k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
435
420k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
436
437
420k
  Distortion uiSum = 0;
438
439
9.25M
  for( ; iRows != 0; iRows -= iSubStep )
440
8.83M
  {
441
8.83M
    uiSum += abs( piOrg[0] - piCur[0] );
442
8.83M
    uiSum += abs( piOrg[1] - piCur[1] );
443
8.83M
    uiSum += abs( piOrg[2] - piCur[2] );
444
8.83M
    uiSum += abs( piOrg[3] - piCur[3] );
445
8.83M
    uiSum += abs( piOrg[4] - piCur[4] );
446
8.83M
    uiSum += abs( piOrg[5] - piCur[5] );
447
8.83M
    uiSum += abs( piOrg[6] - piCur[6] );
448
8.83M
    uiSum += abs( piOrg[7] - piCur[7] );
449
8.83M
    uiSum += abs( piOrg[8] - piCur[8] );
450
8.83M
    uiSum += abs( piOrg[9] - piCur[9] );
451
8.83M
    uiSum += abs( piOrg[10] - piCur[10] );
452
8.83M
    uiSum += abs( piOrg[11] - piCur[11] );
453
8.83M
    uiSum += abs( piOrg[12] - piCur[12] );
454
8.83M
    uiSum += abs( piOrg[13] - piCur[13] );
455
8.83M
    uiSum += abs( piOrg[14] - piCur[14] );
456
8.83M
    uiSum += abs( piOrg[15] - piCur[15] );
457
458
8.83M
    piOrg += iStrideOrg;
459
8.83M
    piCur += iStrideCur;
460
8.83M
  }
461
462
420k
  uiSum <<= iSubShift;
463
420k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
464
420k
}
465
466
467
Distortion RdCost::xGetSAD128( const DistParam &rcDtParam )
468
0
{
469
0
  const Pel* piOrg  = rcDtParam.org.buf;
470
0
  const Pel* piCur  = rcDtParam.cur.buf;
471
0
  int  iRows        = rcDtParam.org.height;
472
0
  int  iCols        = rcDtParam.org.width;
473
0
  int  iSubShift    = rcDtParam.subShift;
474
0
  int  iSubStep     = ( 1 << iSubShift );
475
0
  int  iStrideCur   = rcDtParam.cur.stride * iSubStep;
476
0
  int  iStrideOrg   = rcDtParam.org.stride * iSubStep;
477
478
0
  Distortion uiSum = 0;
479
480
0
  for( ; iRows != 0; iRows-=iSubStep )
481
0
  {
482
0
    for (int n = 0; n < iCols; n+=16 )
483
0
    {
484
0
      uiSum += abs( piOrg[n+ 0] - piCur[n+ 0] );
485
0
      uiSum += abs( piOrg[n+ 1] - piCur[n+ 1] );
486
0
      uiSum += abs( piOrg[n+ 2] - piCur[n+ 2] );
487
0
      uiSum += abs( piOrg[n+ 3] - piCur[n+ 3] );
488
0
      uiSum += abs( piOrg[n+ 4] - piCur[n+ 4] );
489
0
      uiSum += abs( piOrg[n+ 5] - piCur[n+ 5] );
490
0
      uiSum += abs( piOrg[n+ 6] - piCur[n+ 6] );
491
0
      uiSum += abs( piOrg[n+ 7] - piCur[n+ 7] );
492
0
      uiSum += abs( piOrg[n+ 8] - piCur[n+ 8] );
493
0
      uiSum += abs( piOrg[n+ 9] - piCur[n+ 9] );
494
0
      uiSum += abs( piOrg[n+10] - piCur[n+10] );
495
0
      uiSum += abs( piOrg[n+11] - piCur[n+11] );
496
0
      uiSum += abs( piOrg[n+12] - piCur[n+12] );
497
0
      uiSum += abs( piOrg[n+13] - piCur[n+13] );
498
0
      uiSum += abs( piOrg[n+14] - piCur[n+14] );
499
0
      uiSum += abs( piOrg[n+15] - piCur[n+15] );
500
0
    }
501
0
    piOrg += iStrideOrg;
502
0
    piCur += iStrideCur;
503
0
  }
504
505
0
  uiSum <<= iSubShift;
506
0
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
507
0
}
508
509
Distortion RdCost::xGetSAD32( const DistParam &rcDtParam )
510
632k
{
511
632k
  if ( rcDtParam.applyWeight )
512
0
  {
513
0
    THROW(" no support");
514
0
  }
515
516
632k
  const Pel* piOrg      = rcDtParam.org.buf;
517
632k
  const Pel* piCur      = rcDtParam.cur.buf;
518
632k
  int  iRows            = rcDtParam.org.height;
519
632k
  int  iSubShift        = rcDtParam.subShift;
520
632k
  int  iSubStep         = ( 1 << iSubShift );
521
632k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
522
632k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
523
524
632k
  Distortion uiSum = 0;
525
526
12.4M
  for( ; iRows != 0; iRows-=iSubStep )
527
11.8M
  {
528
11.8M
    uiSum += abs( piOrg[0] - piCur[0] );
529
11.8M
    uiSum += abs( piOrg[1] - piCur[1] );
530
11.8M
    uiSum += abs( piOrg[2] - piCur[2] );
531
11.8M
    uiSum += abs( piOrg[3] - piCur[3] );
532
11.8M
    uiSum += abs( piOrg[4] - piCur[4] );
533
11.8M
    uiSum += abs( piOrg[5] - piCur[5] );
534
11.8M
    uiSum += abs( piOrg[6] - piCur[6] );
535
11.8M
    uiSum += abs( piOrg[7] - piCur[7] );
536
11.8M
    uiSum += abs( piOrg[8] - piCur[8] );
537
11.8M
    uiSum += abs( piOrg[9] - piCur[9] );
538
11.8M
    uiSum += abs( piOrg[10] - piCur[10] );
539
11.8M
    uiSum += abs( piOrg[11] - piCur[11] );
540
11.8M
    uiSum += abs( piOrg[12] - piCur[12] );
541
11.8M
    uiSum += abs( piOrg[13] - piCur[13] );
542
11.8M
    uiSum += abs( piOrg[14] - piCur[14] );
543
11.8M
    uiSum += abs( piOrg[15] - piCur[15] );
544
11.8M
    uiSum += abs( piOrg[16] - piCur[16] );
545
11.8M
    uiSum += abs( piOrg[17] - piCur[17] );
546
11.8M
    uiSum += abs( piOrg[18] - piCur[18] );
547
11.8M
    uiSum += abs( piOrg[19] - piCur[19] );
548
11.8M
    uiSum += abs( piOrg[20] - piCur[20] );
549
11.8M
    uiSum += abs( piOrg[21] - piCur[21] );
550
11.8M
    uiSum += abs( piOrg[22] - piCur[22] );
551
11.8M
    uiSum += abs( piOrg[23] - piCur[23] );
552
11.8M
    uiSum += abs( piOrg[24] - piCur[24] );
553
11.8M
    uiSum += abs( piOrg[25] - piCur[25] );
554
11.8M
    uiSum += abs( piOrg[26] - piCur[26] );
555
11.8M
    uiSum += abs( piOrg[27] - piCur[27] );
556
11.8M
    uiSum += abs( piOrg[28] - piCur[28] );
557
11.8M
    uiSum += abs( piOrg[29] - piCur[29] );
558
11.8M
    uiSum += abs( piOrg[30] - piCur[30] );
559
11.8M
    uiSum += abs( piOrg[31] - piCur[31] );
560
561
11.8M
    piOrg += iStrideOrg;
562
11.8M
    piCur += iStrideCur;
563
11.8M
  }
564
565
632k
  uiSum <<= iSubShift;
566
632k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
567
632k
}
568
569
570
Distortion RdCost::xGetSAD64( const DistParam &rcDtParam )
571
7.84k
{
572
7.84k
  if ( rcDtParam.applyWeight )
573
0
  {
574
0
    THROW(" no support");
575
0
  }
576
577
7.84k
  const Pel* piOrg      = rcDtParam.org.buf;
578
7.84k
  const Pel* piCur      = rcDtParam.cur.buf;
579
7.84k
  int  iRows            = rcDtParam.org.height;
580
7.84k
  int  iSubShift        = rcDtParam.subShift;
581
7.84k
  int  iSubStep         = ( 1 << iSubShift );
582
7.84k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
583
7.84k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
584
585
7.84k
  Distortion uiSum = 0;
586
587
509k
  for( ; iRows != 0; iRows-=iSubStep )
588
502k
  {
589
502k
    uiSum += abs( piOrg[0] - piCur[0] );
590
502k
    uiSum += abs( piOrg[1] - piCur[1] );
591
502k
    uiSum += abs( piOrg[2] - piCur[2] );
592
502k
    uiSum += abs( piOrg[3] - piCur[3] );
593
502k
    uiSum += abs( piOrg[4] - piCur[4] );
594
502k
    uiSum += abs( piOrg[5] - piCur[5] );
595
502k
    uiSum += abs( piOrg[6] - piCur[6] );
596
502k
    uiSum += abs( piOrg[7] - piCur[7] );
597
502k
    uiSum += abs( piOrg[8] - piCur[8] );
598
502k
    uiSum += abs( piOrg[9] - piCur[9] );
599
502k
    uiSum += abs( piOrg[10] - piCur[10] );
600
502k
    uiSum += abs( piOrg[11] - piCur[11] );
601
502k
    uiSum += abs( piOrg[12] - piCur[12] );
602
502k
    uiSum += abs( piOrg[13] - piCur[13] );
603
502k
    uiSum += abs( piOrg[14] - piCur[14] );
604
502k
    uiSum += abs( piOrg[15] - piCur[15] );
605
502k
    uiSum += abs( piOrg[16] - piCur[16] );
606
502k
    uiSum += abs( piOrg[17] - piCur[17] );
607
502k
    uiSum += abs( piOrg[18] - piCur[18] );
608
502k
    uiSum += abs( piOrg[19] - piCur[19] );
609
502k
    uiSum += abs( piOrg[20] - piCur[20] );
610
502k
    uiSum += abs( piOrg[21] - piCur[21] );
611
502k
    uiSum += abs( piOrg[22] - piCur[22] );
612
502k
    uiSum += abs( piOrg[23] - piCur[23] );
613
502k
    uiSum += abs( piOrg[24] - piCur[24] );
614
502k
    uiSum += abs( piOrg[25] - piCur[25] );
615
502k
    uiSum += abs( piOrg[26] - piCur[26] );
616
502k
    uiSum += abs( piOrg[27] - piCur[27] );
617
502k
    uiSum += abs( piOrg[28] - piCur[28] );
618
502k
    uiSum += abs( piOrg[29] - piCur[29] );
619
502k
    uiSum += abs( piOrg[30] - piCur[30] );
620
502k
    uiSum += abs( piOrg[31] - piCur[31] );
621
502k
    uiSum += abs( piOrg[32] - piCur[32] );
622
502k
    uiSum += abs( piOrg[33] - piCur[33] );
623
502k
    uiSum += abs( piOrg[34] - piCur[34] );
624
502k
    uiSum += abs( piOrg[35] - piCur[35] );
625
502k
    uiSum += abs( piOrg[36] - piCur[36] );
626
502k
    uiSum += abs( piOrg[37] - piCur[37] );
627
502k
    uiSum += abs( piOrg[38] - piCur[38] );
628
502k
    uiSum += abs( piOrg[39] - piCur[39] );
629
502k
    uiSum += abs( piOrg[40] - piCur[40] );
630
502k
    uiSum += abs( piOrg[41] - piCur[41] );
631
502k
    uiSum += abs( piOrg[42] - piCur[42] );
632
502k
    uiSum += abs( piOrg[43] - piCur[43] );
633
502k
    uiSum += abs( piOrg[44] - piCur[44] );
634
502k
    uiSum += abs( piOrg[45] - piCur[45] );
635
502k
    uiSum += abs( piOrg[46] - piCur[46] );
636
502k
    uiSum += abs( piOrg[47] - piCur[47] );
637
502k
    uiSum += abs( piOrg[48] - piCur[48] );
638
502k
    uiSum += abs( piOrg[49] - piCur[49] );
639
502k
    uiSum += abs( piOrg[50] - piCur[50] );
640
502k
    uiSum += abs( piOrg[51] - piCur[51] );
641
502k
    uiSum += abs( piOrg[52] - piCur[52] );
642
502k
    uiSum += abs( piOrg[53] - piCur[53] );
643
502k
    uiSum += abs( piOrg[54] - piCur[54] );
644
502k
    uiSum += abs( piOrg[55] - piCur[55] );
645
502k
    uiSum += abs( piOrg[56] - piCur[56] );
646
502k
    uiSum += abs( piOrg[57] - piCur[57] );
647
502k
    uiSum += abs( piOrg[58] - piCur[58] );
648
502k
    uiSum += abs( piOrg[59] - piCur[59] );
649
502k
    uiSum += abs( piOrg[60] - piCur[60] );
650
502k
    uiSum += abs( piOrg[61] - piCur[61] );
651
502k
    uiSum += abs( piOrg[62] - piCur[62] );
652
502k
    uiSum += abs( piOrg[63] - piCur[63] );
653
654
502k
    piOrg += iStrideOrg;
655
502k
    piCur += iStrideCur;
656
502k
  }
657
658
7.84k
  uiSum <<= iSubShift;
659
7.84k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
660
7.84k
}
661
662
663
// --------------------------------------------------------------------------------------------------------------------
664
// SSE
665
// --------------------------------------------------------------------------------------------------------------------
666
667
Distortion RdCost::xGetSSE( const DistParam &rcDtParam )
668
0
{
669
0
  if ( rcDtParam.applyWeight )
670
0
  {
671
0
    THROW(" no support");
672
0
  }
673
674
0
  const Pel* piOrg      = rcDtParam.org.buf;
675
0
  const Pel* piCur      = rcDtParam.cur.buf;
676
0
  int  iRows            = rcDtParam.org.height;
677
0
  int  iCols            = rcDtParam.org.width;
678
0
  int  iStrideCur       = rcDtParam.cur.stride;
679
0
  int  iStrideOrg       = rcDtParam.org.stride;
680
681
0
  Distortion uiSum   = 0;
682
0
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
683
684
0
  Intermediate_Int iTemp;
685
686
0
  for( ; iRows != 0; iRows-- )
687
0
  {
688
0
    for (int n = 0; n < iCols; n++ )
689
0
    {
690
0
      iTemp = piOrg[n  ] - piCur[n  ];
691
0
      uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
692
0
    }
693
0
    piOrg += iStrideOrg;
694
0
    piCur += iStrideCur;
695
0
  }
696
697
0
  return ( uiSum );
698
0
}
699
700
Distortion RdCost::xGetSSE4( const DistParam &rcDtParam )
701
513k
{
702
513k
  if ( rcDtParam.applyWeight )
703
0
  {
704
0
    CHECK( rcDtParam.org.width != 4, "Invalid size" );
705
0
    THROW(" no support");
706
0
  }
707
708
513k
  const Pel* piOrg   = rcDtParam.org.buf;
709
513k
  const Pel* piCur   = rcDtParam.cur.buf;
710
513k
  int  iRows         = rcDtParam.org.height;
711
513k
  int  iStrideOrg    = rcDtParam.org.stride;
712
513k
  int  iStrideCur    = rcDtParam.cur.stride;
713
714
513k
  Distortion uiSum   = 0;
715
513k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
716
717
513k
  Intermediate_Int  iTemp;
718
719
7.19M
  for( ; iRows != 0; iRows-- )
720
6.68M
  {
721
722
6.68M
    iTemp = piOrg[0] - piCur[0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
723
6.68M
    iTemp = piOrg[1] - piCur[1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
724
6.68M
    iTemp = piOrg[2] - piCur[2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
725
6.68M
    iTemp = piOrg[3] - piCur[3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
726
727
6.68M
    piOrg += iStrideOrg;
728
6.68M
    piCur += iStrideCur;
729
6.68M
  }
730
731
513k
  return ( uiSum );
732
513k
}
733
734
Distortion RdCost::xGetSSE8( const DistParam &rcDtParam )
735
774k
{
736
774k
  if ( rcDtParam.applyWeight )
737
0
  {
738
0
    CHECK( rcDtParam.org.width != 8, "Invalid size" );
739
0
    THROW(" no support");
740
0
  }
741
742
774k
  const Pel* piOrg   = rcDtParam.org.buf;
743
774k
  const Pel* piCur   = rcDtParam.cur.buf;
744
774k
  int  iRows         = rcDtParam.org.height;
745
774k
  int  iStrideOrg    = rcDtParam.org.stride;
746
774k
  int  iStrideCur    = rcDtParam.cur.stride;
747
748
774k
  Distortion uiSum   = 0;
749
774k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
750
751
774k
  Intermediate_Int  iTemp;
752
753
11.9M
  for( ; iRows != 0; iRows-- )
754
11.2M
  {
755
11.2M
    iTemp = piOrg[0] - piCur[0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
756
11.2M
    iTemp = piOrg[1] - piCur[1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
757
11.2M
    iTemp = piOrg[2] - piCur[2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
758
11.2M
    iTemp = piOrg[3] - piCur[3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
759
11.2M
    iTemp = piOrg[4] - piCur[4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
760
11.2M
    iTemp = piOrg[5] - piCur[5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
761
11.2M
    iTemp = piOrg[6] - piCur[6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
762
11.2M
    iTemp = piOrg[7] - piCur[7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
763
764
11.2M
    piOrg += iStrideOrg;
765
11.2M
    piCur += iStrideCur;
766
11.2M
  }
767
768
774k
  return ( uiSum );
769
774k
}
770
771
Distortion RdCost::xGetSSE16( const DistParam &rcDtParam )
772
630k
{
773
630k
  if ( rcDtParam.applyWeight )
774
0
  {
775
0
    CHECK( rcDtParam.org.width != 16, "Invalid size" );
776
0
    THROW(" no support");
777
0
  }
778
779
630k
  const Pel* piOrg   = rcDtParam.org.buf;
780
630k
  const Pel* piCur   = rcDtParam.cur.buf;
781
630k
  int  iRows         = rcDtParam.org.height;
782
630k
  int  iStrideOrg    = rcDtParam.org.stride;
783
630k
  int  iStrideCur    = rcDtParam.cur.stride;
784
785
630k
  Distortion uiSum   = 0;
786
630k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
787
788
630k
  Intermediate_Int  iTemp;
789
790
9.75M
  for( ; iRows != 0; iRows-- )
791
9.12M
  {
792
793
9.12M
    iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
794
9.12M
    iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
795
9.12M
    iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
796
9.12M
    iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
797
9.12M
    iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
798
9.12M
    iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
799
9.12M
    iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
800
9.12M
    iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
801
9.12M
    iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
802
9.12M
    iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
803
9.12M
    iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
804
9.12M
    iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
805
9.12M
    iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
806
9.12M
    iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
807
9.12M
    iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
808
9.12M
    iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
809
810
9.12M
    piOrg += iStrideOrg;
811
9.12M
    piCur += iStrideCur;
812
9.12M
  }
813
814
630k
  return ( uiSum );
815
630k
}
816
817
Distortion RdCost::xGetSSE128( const DistParam &rcDtParam )
818
0
{
819
0
  if ( rcDtParam.applyWeight )
820
0
  {
821
0
    THROW(" no support");
822
0
  }
823
0
  const Pel* piOrg   = rcDtParam.org.buf;
824
0
  const Pel* piCur   = rcDtParam.cur.buf;
825
0
  int  iRows         = rcDtParam.org.height;
826
0
  int  iCols         = rcDtParam.org.width;
827
0
  int  iStrideOrg    = rcDtParam.org.stride;
828
0
  int  iStrideCur    = rcDtParam.cur.stride;
829
830
0
  Distortion uiSum   = 0;
831
0
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
832
833
0
  Intermediate_Int  iTemp;
834
835
0
  for( ; iRows != 0; iRows-- )
836
0
  {
837
0
    for (int n = 0; n < iCols; n+=16 )
838
0
    {
839
840
0
      iTemp = piOrg[n+ 0] - piCur[n+ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
841
0
      iTemp = piOrg[n+ 1] - piCur[n+ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
842
0
      iTemp = piOrg[n+ 2] - piCur[n+ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
843
0
      iTemp = piOrg[n+ 3] - piCur[n+ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
844
0
      iTemp = piOrg[n+ 4] - piCur[n+ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
845
0
      iTemp = piOrg[n+ 5] - piCur[n+ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
846
0
      iTemp = piOrg[n+ 6] - piCur[n+ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
847
0
      iTemp = piOrg[n+ 7] - piCur[n+ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
848
0
      iTemp = piOrg[n+ 8] - piCur[n+ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
849
0
      iTemp = piOrg[n+ 9] - piCur[n+ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
850
0
      iTemp = piOrg[n+10] - piCur[n+10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
851
0
      iTemp = piOrg[n+11] - piCur[n+11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
852
0
      iTemp = piOrg[n+12] - piCur[n+12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
853
0
      iTemp = piOrg[n+13] - piCur[n+13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
854
0
      iTemp = piOrg[n+14] - piCur[n+14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
855
0
      iTemp = piOrg[n+15] - piCur[n+15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
856
857
0
    }
858
0
    piOrg += iStrideOrg;
859
0
    piCur += iStrideCur;
860
0
  }
861
862
0
  return ( uiSum );
863
0
}
864
865
Distortion RdCost::xGetSSE32( const DistParam &rcDtParam )
866
518k
{
867
518k
  if ( rcDtParam.applyWeight )
868
0
  {
869
0
    THROW(" no support");
870
0
  }
871
872
518k
  const Pel* piOrg   = rcDtParam.org.buf;
873
518k
  const Pel* piCur   = rcDtParam.cur.buf;
874
518k
  int  iRows         = rcDtParam.org.height;
875
518k
  int  iStrideOrg    = rcDtParam.org.stride;
876
518k
  int  iStrideCur    = rcDtParam.cur.stride;
877
878
518k
  Distortion uiSum   = 0;
879
518k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
880
881
518k
  Intermediate_Int  iTemp;
882
883
9.07M
  for( ; iRows != 0; iRows-- )
884
8.56M
  {
885
886
8.56M
    iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
887
8.56M
    iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
888
8.56M
    iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
889
8.56M
    iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
890
8.56M
    iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
891
8.56M
    iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
892
8.56M
    iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
893
8.56M
    iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
894
8.56M
    iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
895
8.56M
    iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
896
8.56M
    iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
897
8.56M
    iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
898
8.56M
    iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
899
8.56M
    iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
900
8.56M
    iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
901
8.56M
    iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
902
8.56M
    iTemp = piOrg[16] - piCur[16]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
903
8.56M
    iTemp = piOrg[17] - piCur[17]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
904
8.56M
    iTemp = piOrg[18] - piCur[18]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
905
8.56M
    iTemp = piOrg[19] - piCur[19]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
906
8.56M
    iTemp = piOrg[20] - piCur[20]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
907
8.56M
    iTemp = piOrg[21] - piCur[21]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
908
8.56M
    iTemp = piOrg[22] - piCur[22]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
909
8.56M
    iTemp = piOrg[23] - piCur[23]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
910
8.56M
    iTemp = piOrg[24] - piCur[24]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
911
8.56M
    iTemp = piOrg[25] - piCur[25]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
912
8.56M
    iTemp = piOrg[26] - piCur[26]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
913
8.56M
    iTemp = piOrg[27] - piCur[27]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
914
8.56M
    iTemp = piOrg[28] - piCur[28]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
915
8.56M
    iTemp = piOrg[29] - piCur[29]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
916
8.56M
    iTemp = piOrg[30] - piCur[30]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
917
8.56M
    iTemp = piOrg[31] - piCur[31]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
918
919
8.56M
    piOrg += iStrideOrg;
920
8.56M
    piCur += iStrideCur;
921
8.56M
  }
922
923
518k
  return ( uiSum );
924
518k
}
925
926
Distortion RdCost::xGetSSE64( const DistParam &rcDtParam )
927
61.5k
{
928
61.5k
  if ( rcDtParam.applyWeight )
929
0
  {
930
0
    THROW(" no support");
931
0
  }
932
933
61.5k
  const Pel* piOrg   = rcDtParam.org.buf;
934
61.5k
  const Pel* piCur   = rcDtParam.cur.buf;
935
61.5k
  int  iRows         = rcDtParam.org.height;
936
61.5k
  int  iStrideOrg    = rcDtParam.org.stride;
937
61.5k
  int  iStrideCur    = rcDtParam.cur.stride;
938
939
61.5k
  Distortion uiSum   = 0;
940
61.5k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
941
942
61.5k
  Intermediate_Int  iTemp;
943
944
3.26M
  for( ; iRows != 0; iRows-- )
945
3.20M
  {
946
3.20M
    iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
947
3.20M
    iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
948
3.20M
    iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
949
3.20M
    iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
950
3.20M
    iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
951
3.20M
    iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
952
3.20M
    iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
953
3.20M
    iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
954
3.20M
    iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
955
3.20M
    iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
956
3.20M
    iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
957
3.20M
    iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
958
3.20M
    iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
959
3.20M
    iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
960
3.20M
    iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
961
3.20M
    iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
962
3.20M
    iTemp = piOrg[16] - piCur[16]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
963
3.20M
    iTemp = piOrg[17] - piCur[17]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
964
3.20M
    iTemp = piOrg[18] - piCur[18]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
965
3.20M
    iTemp = piOrg[19] - piCur[19]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
966
3.20M
    iTemp = piOrg[20] - piCur[20]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
967
3.20M
    iTemp = piOrg[21] - piCur[21]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
968
3.20M
    iTemp = piOrg[22] - piCur[22]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
969
3.20M
    iTemp = piOrg[23] - piCur[23]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
970
3.20M
    iTemp = piOrg[24] - piCur[24]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
971
3.20M
    iTemp = piOrg[25] - piCur[25]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
972
3.20M
    iTemp = piOrg[26] - piCur[26]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
973
3.20M
    iTemp = piOrg[27] - piCur[27]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
974
3.20M
    iTemp = piOrg[28] - piCur[28]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
975
3.20M
    iTemp = piOrg[29] - piCur[29]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
976
3.20M
    iTemp = piOrg[30] - piCur[30]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
977
3.20M
    iTemp = piOrg[31] - piCur[31]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
978
3.20M
    iTemp = piOrg[32] - piCur[32]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
979
3.20M
    iTemp = piOrg[33] - piCur[33]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
980
3.20M
    iTemp = piOrg[34] - piCur[34]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
981
3.20M
    iTemp = piOrg[35] - piCur[35]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
982
3.20M
    iTemp = piOrg[36] - piCur[36]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
983
3.20M
    iTemp = piOrg[37] - piCur[37]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
984
3.20M
    iTemp = piOrg[38] - piCur[38]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
985
3.20M
    iTemp = piOrg[39] - piCur[39]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
986
3.20M
    iTemp = piOrg[40] - piCur[40]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
987
3.20M
    iTemp = piOrg[41] - piCur[41]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
988
3.20M
    iTemp = piOrg[42] - piCur[42]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
989
3.20M
    iTemp = piOrg[43] - piCur[43]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
990
3.20M
    iTemp = piOrg[44] - piCur[44]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
991
3.20M
    iTemp = piOrg[45] - piCur[45]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
992
3.20M
    iTemp = piOrg[46] - piCur[46]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
993
3.20M
    iTemp = piOrg[47] - piCur[47]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
994
3.20M
    iTemp = piOrg[48] - piCur[48]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
995
3.20M
    iTemp = piOrg[49] - piCur[49]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
996
3.20M
    iTemp = piOrg[50] - piCur[50]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
997
3.20M
    iTemp = piOrg[51] - piCur[51]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
998
3.20M
    iTemp = piOrg[52] - piCur[52]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
999
3.20M
    iTemp = piOrg[53] - piCur[53]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1000
3.20M
    iTemp = piOrg[54] - piCur[54]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1001
3.20M
    iTemp = piOrg[55] - piCur[55]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1002
3.20M
    iTemp = piOrg[56] - piCur[56]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1003
3.20M
    iTemp = piOrg[57] - piCur[57]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1004
3.20M
    iTemp = piOrg[58] - piCur[58]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1005
3.20M
    iTemp = piOrg[59] - piCur[59]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1006
3.20M
    iTemp = piOrg[60] - piCur[60]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1007
3.20M
    iTemp = piOrg[61] - piCur[61]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1008
3.20M
    iTemp = piOrg[62] - piCur[62]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1009
3.20M
    iTemp = piOrg[63] - piCur[63]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1010
1011
3.20M
    piOrg += iStrideOrg;
1012
3.20M
    piCur += iStrideCur;
1013
3.20M
  }
1014
1015
61.5k
  return ( uiSum );
1016
61.5k
}
1017
1018
// --------------------------------------------------------------------------------------------------------------------
1019
// HADAMARD with step (used in fractional search)
1020
// --------------------------------------------------------------------------------------------------------------------
1021
1022
Distortion RdCost::xCalcHADs2x2( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1023
148k
{
1024
148k
  Distortion satd = 0;
1025
148k
  TCoeff diff[4], m[4];
1026
1027
148k
  diff[0] = piOrg[0             ] - piCur[0];
1028
148k
  diff[1] = piOrg[1             ] - piCur[1];
1029
148k
  diff[2] = piOrg[iStrideOrg    ] - piCur[0 + iStrideCur];
1030
148k
  diff[3] = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur];
1031
148k
  m[0] = diff[0] + diff[2];
1032
148k
  m[1] = diff[1] + diff[3];
1033
148k
  m[2] = diff[0] - diff[2];
1034
148k
  m[3] = diff[1] - diff[3];
1035
  
1036
148k
  satd += abs(m[0] + m[1]) >> 2;
1037
148k
  satd += abs(m[0] - m[1]);
1038
148k
  satd += abs(m[2] + m[3]);
1039
148k
  satd += abs(m[2] - m[3]);
1040
1041
148k
  return satd;
1042
148k
}
1043
1044
static Distortion xCalcHADs4x4( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1045
40.5k
{
1046
40.5k
  int k;
1047
40.5k
  Distortion satd = 0;
1048
40.5k
  TCoeff diff[16], m[16], d[16];
1049
1050
202k
  for( k = 0; k < 16; k+=4 )
1051
162k
  {
1052
162k
    diff[k+0] = piOrg[0] - piCur[0];
1053
162k
    diff[k+1] = piOrg[1] - piCur[1];
1054
162k
    diff[k+2] = piOrg[2] - piCur[2];
1055
162k
    diff[k+3] = piOrg[3] - piCur[3];
1056
1057
162k
    piCur += iStrideCur;
1058
162k
    piOrg += iStrideOrg;
1059
162k
  }
1060
1061
  /*===== hadamard transform =====*/
1062
40.5k
  m[ 0] = diff[ 0] + diff[12];
1063
40.5k
  m[ 1] = diff[ 1] + diff[13];
1064
40.5k
  m[ 2] = diff[ 2] + diff[14];
1065
40.5k
  m[ 3] = diff[ 3] + diff[15];
1066
40.5k
  m[ 4] = diff[ 4] + diff[ 8];
1067
40.5k
  m[ 5] = diff[ 5] + diff[ 9];
1068
40.5k
  m[ 6] = diff[ 6] + diff[10];
1069
40.5k
  m[ 7] = diff[ 7] + diff[11];
1070
40.5k
  m[ 8] = diff[ 4] - diff[ 8];
1071
40.5k
  m[ 9] = diff[ 5] - diff[ 9];
1072
40.5k
  m[10] = diff[ 6] - diff[10];
1073
40.5k
  m[11] = diff[ 7] - diff[11];
1074
40.5k
  m[12] = diff[ 0] - diff[12];
1075
40.5k
  m[13] = diff[ 1] - diff[13];
1076
40.5k
  m[14] = diff[ 2] - diff[14];
1077
40.5k
  m[15] = diff[ 3] - diff[15];
1078
1079
40.5k
  d[ 0] = m[ 0] + m[ 4];
1080
40.5k
  d[ 1] = m[ 1] + m[ 5];
1081
40.5k
  d[ 2] = m[ 2] + m[ 6];
1082
40.5k
  d[ 3] = m[ 3] + m[ 7];
1083
40.5k
  d[ 4] = m[ 8] + m[12];
1084
40.5k
  d[ 5] = m[ 9] + m[13];
1085
40.5k
  d[ 6] = m[10] + m[14];
1086
40.5k
  d[ 7] = m[11] + m[15];
1087
40.5k
  d[ 8] = m[ 0] - m[ 4];
1088
40.5k
  d[ 9] = m[ 1] - m[ 5];
1089
40.5k
  d[10] = m[ 2] - m[ 6];
1090
40.5k
  d[11] = m[ 3] - m[ 7];
1091
40.5k
  d[12] = m[12] - m[ 8];
1092
40.5k
  d[13] = m[13] - m[ 9];
1093
40.5k
  d[14] = m[14] - m[10];
1094
40.5k
  d[15] = m[15] - m[11];
1095
1096
40.5k
  m[ 0] = d[ 0] + d[ 3];
1097
40.5k
  m[ 1] = d[ 1] + d[ 2];
1098
40.5k
  m[ 2] = d[ 1] - d[ 2];
1099
40.5k
  m[ 3] = d[ 0] - d[ 3];
1100
40.5k
  m[ 4] = d[ 4] + d[ 7];
1101
40.5k
  m[ 5] = d[ 5] + d[ 6];
1102
40.5k
  m[ 6] = d[ 5] - d[ 6];
1103
40.5k
  m[ 7] = d[ 4] - d[ 7];
1104
40.5k
  m[ 8] = d[ 8] + d[11];
1105
40.5k
  m[ 9] = d[ 9] + d[10];
1106
40.5k
  m[10] = d[ 9] - d[10];
1107
40.5k
  m[11] = d[ 8] - d[11];
1108
40.5k
  m[12] = d[12] + d[15];
1109
40.5k
  m[13] = d[13] + d[14];
1110
40.5k
  m[14] = d[13] - d[14];
1111
40.5k
  m[15] = d[12] - d[15];
1112
1113
40.5k
  d[ 0] = m[ 0] + m[ 1];
1114
40.5k
  d[ 1] = m[ 0] - m[ 1];
1115
40.5k
  d[ 2] = m[ 2] + m[ 3];
1116
40.5k
  d[ 3] = m[ 3] - m[ 2];
1117
40.5k
  d[ 4] = m[ 4] + m[ 5];
1118
40.5k
  d[ 5] = m[ 4] - m[ 5];
1119
40.5k
  d[ 6] = m[ 6] + m[ 7];
1120
40.5k
  d[ 7] = m[ 7] - m[ 6];
1121
40.5k
  d[ 8] = m[ 8] + m[ 9];
1122
40.5k
  d[ 9] = m[ 8] - m[ 9];
1123
40.5k
  d[10] = m[10] + m[11];
1124
40.5k
  d[11] = m[11] - m[10];
1125
40.5k
  d[12] = m[12] + m[13];
1126
40.5k
  d[13] = m[12] - m[13];
1127
40.5k
  d[14] = m[14] + m[15];
1128
40.5k
  d[15] = m[15] - m[14];
1129
1130
689k
  for (k=0; k<16; ++k)
1131
648k
  {
1132
648k
    satd += abs(d[k]);
1133
648k
  }
1134
1135
40.5k
  satd -= abs( d[0] );
1136
40.5k
  satd += abs( d[0] ) >> 2;
1137
40.5k
  satd = ((satd+1)>>1);
1138
1139
40.5k
  return satd;
1140
40.5k
}
1141
1142
static Distortion xCalcHADs16x16_fast( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1143
0
{
1144
0
  int k, i, j, jj;
1145
0
  Distortion sad = 0;
1146
0
  TCoeff diff[64], m1[8][8], m2[8][8], m3[8][8];
1147
1148
0
  for( k = 0; k < 64; k += 8 )
1149
0
  {
1150
0
    diff[k+0] = ( ( piOrg[ 0] + piOrg[ 0+1] + piOrg[ 0+iStrideOrg] + piOrg[ 0+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 0] + piCur[ 0+1] + piCur[ 0+iStrideCur] + piCur[ 0+1+iStrideCur] + 2 ) >> 2 );
1151
0
    diff[k+1] = ( ( piOrg[ 2] + piOrg[ 2+1] + piOrg[ 2+iStrideOrg] + piOrg[ 2+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 2] + piCur[ 2+1] + piCur[ 2+iStrideCur] + piCur[ 2+1+iStrideCur] + 2 ) >> 2 );
1152
0
    diff[k+2] = ( ( piOrg[ 4] + piOrg[ 4+1] + piOrg[ 4+iStrideOrg] + piOrg[ 4+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 4] + piCur[ 4+1] + piCur[ 4+iStrideCur] + piCur[ 4+1+iStrideCur] + 2 ) >> 2 );
1153
0
    diff[k+3] = ( ( piOrg[ 6] + piOrg[ 6+1] + piOrg[ 6+iStrideOrg] + piOrg[ 6+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 6] + piCur[ 6+1] + piCur[ 6+iStrideCur] + piCur[ 6+1+iStrideCur] + 2 ) >> 2 );
1154
0
    diff[k+4] = ( ( piOrg[ 8] + piOrg[ 8+1] + piOrg[ 8+iStrideOrg] + piOrg[ 8+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 8] + piCur[ 8+1] + piCur[ 8+iStrideCur] + piCur[ 8+1+iStrideCur] + 2 ) >> 2 );
1155
0
    diff[k+5] = ( ( piOrg[10] + piOrg[10+1] + piOrg[10+iStrideOrg] + piOrg[10+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[10] + piCur[10+1] + piCur[10+iStrideCur] + piCur[10+1+iStrideCur] + 2 ) >> 2 );
1156
0
    diff[k+6] = ( ( piOrg[12] + piOrg[12+1] + piOrg[12+iStrideOrg] + piOrg[12+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[12] + piCur[12+1] + piCur[12+iStrideCur] + piCur[12+1+iStrideCur] + 2 ) >> 2 );
1157
0
    diff[k+7] = ( ( piOrg[14] + piOrg[14+1] + piOrg[14+iStrideOrg] + piOrg[14+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[14] + piCur[14+1] + piCur[14+iStrideCur] + piCur[14+1+iStrideCur] + 2 ) >> 2 );
1158
1159
0
    piCur += 2 * iStrideCur;
1160
0
    piOrg += 2 * iStrideOrg;
1161
0
  }
1162
1163
  //horizontal
1164
0
  for (j=0; j < 8; j++)
1165
0
  {
1166
0
    jj = j << 3;
1167
0
    m2[j][0] = diff[jj  ] + diff[jj+4];
1168
0
    m2[j][1] = diff[jj+1] + diff[jj+5];
1169
0
    m2[j][2] = diff[jj+2] + diff[jj+6];
1170
0
    m2[j][3] = diff[jj+3] + diff[jj+7];
1171
0
    m2[j][4] = diff[jj  ] - diff[jj+4];
1172
0
    m2[j][5] = diff[jj+1] - diff[jj+5];
1173
0
    m2[j][6] = diff[jj+2] - diff[jj+6];
1174
0
    m2[j][7] = diff[jj+3] - diff[jj+7];
1175
1176
0
    m1[j][0] = m2[j][0] + m2[j][2];
1177
0
    m1[j][1] = m2[j][1] + m2[j][3];
1178
0
    m1[j][2] = m2[j][0] - m2[j][2];
1179
0
    m1[j][3] = m2[j][1] - m2[j][3];
1180
0
    m1[j][4] = m2[j][4] + m2[j][6];
1181
0
    m1[j][5] = m2[j][5] + m2[j][7];
1182
0
    m1[j][6] = m2[j][4] - m2[j][6];
1183
0
    m1[j][7] = m2[j][5] - m2[j][7];
1184
1185
0
    m2[j][0] = m1[j][0] + m1[j][1];
1186
0
    m2[j][1] = m1[j][0] - m1[j][1];
1187
0
    m2[j][2] = m1[j][2] + m1[j][3];
1188
0
    m2[j][3] = m1[j][2] - m1[j][3];
1189
0
    m2[j][4] = m1[j][4] + m1[j][5];
1190
0
    m2[j][5] = m1[j][4] - m1[j][5];
1191
0
    m2[j][6] = m1[j][6] + m1[j][7];
1192
0
    m2[j][7] = m1[j][6] - m1[j][7];
1193
0
  }
1194
1195
  //vertical
1196
0
  for (i=0; i < 8; i++)
1197
0
  {
1198
0
    m3[0][i] = m2[0][i] + m2[4][i];
1199
0
    m3[1][i] = m2[1][i] + m2[5][i];
1200
0
    m3[2][i] = m2[2][i] + m2[6][i];
1201
0
    m3[3][i] = m2[3][i] + m2[7][i];
1202
0
    m3[4][i] = m2[0][i] - m2[4][i];
1203
0
    m3[5][i] = m2[1][i] - m2[5][i];
1204
0
    m3[6][i] = m2[2][i] - m2[6][i];
1205
0
    m3[7][i] = m2[3][i] - m2[7][i];
1206
1207
0
    m1[0][i] = m3[0][i] + m3[2][i];
1208
0
    m1[1][i] = m3[1][i] + m3[3][i];
1209
0
    m1[2][i] = m3[0][i] - m3[2][i];
1210
0
    m1[3][i] = m3[1][i] - m3[3][i];
1211
0
    m1[4][i] = m3[4][i] + m3[6][i];
1212
0
    m1[5][i] = m3[5][i] + m3[7][i];
1213
0
    m1[6][i] = m3[4][i] - m3[6][i];
1214
0
    m1[7][i] = m3[5][i] - m3[7][i];
1215
1216
0
    m2[0][i] = m1[0][i] + m1[1][i];
1217
0
    m2[1][i] = m1[0][i] - m1[1][i];
1218
0
    m2[2][i] = m1[2][i] + m1[3][i];
1219
0
    m2[3][i] = m1[2][i] - m1[3][i];
1220
0
    m2[4][i] = m1[4][i] + m1[5][i];
1221
0
    m2[5][i] = m1[4][i] - m1[5][i];
1222
0
    m2[6][i] = m1[6][i] + m1[7][i];
1223
0
    m2[7][i] = m1[6][i] - m1[7][i];
1224
0
  }
1225
1226
0
  for (i = 0; i < 8; i++)
1227
0
  {
1228
0
    for (j = 0; j < 8; j++)
1229
0
    {
1230
0
      sad += abs(m2[i][j]);
1231
0
    }
1232
0
  }
1233
  
1234
0
  sad -= abs( m2[0][0] );
1235
0
  sad += abs( m2[0][0] ) >> 2;
1236
0
  sad=((sad+2)>>2);
1237
1238
0
  return (sad << 2);
1239
0
}
1240
1241
static Distortion xCalcHADs8x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1242
14.5M
{
1243
14.5M
  int k, i, j, jj;
1244
14.5M
  Distortion sad = 0;
1245
14.5M
  TCoeff diff[64], m1[8][8], m2[8][8], m3[8][8];
1246
1247
130M
  for( k = 0; k < 64; k += 8 )
1248
116M
  {
1249
116M
    diff[k+0] = piOrg[0] - piCur[0];
1250
116M
    diff[k+1] = piOrg[1] - piCur[1];
1251
116M
    diff[k+2] = piOrg[2] - piCur[2];
1252
116M
    diff[k+3] = piOrg[3] - piCur[3];
1253
116M
    diff[k+4] = piOrg[4] - piCur[4];
1254
116M
    diff[k+5] = piOrg[5] - piCur[5];
1255
116M
    diff[k+6] = piOrg[6] - piCur[6];
1256
116M
    diff[k+7] = piOrg[7] - piCur[7];
1257
1258
116M
    piCur += iStrideCur;
1259
116M
    piOrg += iStrideOrg;
1260
116M
  }
1261
1262
  //horizontal
1263
130M
  for (j=0; j < 8; j++)
1264
116M
  {
1265
116M
    jj = j << 3;
1266
116M
    m2[j][0] = diff[jj  ] + diff[jj+4];
1267
116M
    m2[j][1] = diff[jj+1] + diff[jj+5];
1268
116M
    m2[j][2] = diff[jj+2] + diff[jj+6];
1269
116M
    m2[j][3] = diff[jj+3] + diff[jj+7];
1270
116M
    m2[j][4] = diff[jj  ] - diff[jj+4];
1271
116M
    m2[j][5] = diff[jj+1] - diff[jj+5];
1272
116M
    m2[j][6] = diff[jj+2] - diff[jj+6];
1273
116M
    m2[j][7] = diff[jj+3] - diff[jj+7];
1274
1275
116M
    m1[j][0] = m2[j][0] + m2[j][2];
1276
116M
    m1[j][1] = m2[j][1] + m2[j][3];
1277
116M
    m1[j][2] = m2[j][0] - m2[j][2];
1278
116M
    m1[j][3] = m2[j][1] - m2[j][3];
1279
116M
    m1[j][4] = m2[j][4] + m2[j][6];
1280
116M
    m1[j][5] = m2[j][5] + m2[j][7];
1281
116M
    m1[j][6] = m2[j][4] - m2[j][6];
1282
116M
    m1[j][7] = m2[j][5] - m2[j][7];
1283
1284
116M
    m2[j][0] = m1[j][0] + m1[j][1];
1285
116M
    m2[j][1] = m1[j][0] - m1[j][1];
1286
116M
    m2[j][2] = m1[j][2] + m1[j][3];
1287
116M
    m2[j][3] = m1[j][2] - m1[j][3];
1288
116M
    m2[j][4] = m1[j][4] + m1[j][5];
1289
116M
    m2[j][5] = m1[j][4] - m1[j][5];
1290
116M
    m2[j][6] = m1[j][6] + m1[j][7];
1291
116M
    m2[j][7] = m1[j][6] - m1[j][7];
1292
116M
  }
1293
1294
  //vertical
1295
130M
  for (i=0; i < 8; i++)
1296
116M
  {
1297
116M
    m3[0][i] = m2[0][i] + m2[4][i];
1298
116M
    m3[1][i] = m2[1][i] + m2[5][i];
1299
116M
    m3[2][i] = m2[2][i] + m2[6][i];
1300
116M
    m3[3][i] = m2[3][i] + m2[7][i];
1301
116M
    m3[4][i] = m2[0][i] - m2[4][i];
1302
116M
    m3[5][i] = m2[1][i] - m2[5][i];
1303
116M
    m3[6][i] = m2[2][i] - m2[6][i];
1304
116M
    m3[7][i] = m2[3][i] - m2[7][i];
1305
1306
116M
    m1[0][i] = m3[0][i] + m3[2][i];
1307
116M
    m1[1][i] = m3[1][i] + m3[3][i];
1308
116M
    m1[2][i] = m3[0][i] - m3[2][i];
1309
116M
    m1[3][i] = m3[1][i] - m3[3][i];
1310
116M
    m1[4][i] = m3[4][i] + m3[6][i];
1311
116M
    m1[5][i] = m3[5][i] + m3[7][i];
1312
116M
    m1[6][i] = m3[4][i] - m3[6][i];
1313
116M
    m1[7][i] = m3[5][i] - m3[7][i];
1314
1315
116M
    m2[0][i] = m1[0][i] + m1[1][i];
1316
116M
    m2[1][i] = m1[0][i] - m1[1][i];
1317
116M
    m2[2][i] = m1[2][i] + m1[3][i];
1318
116M
    m2[3][i] = m1[2][i] - m1[3][i];
1319
116M
    m2[4][i] = m1[4][i] + m1[5][i];
1320
116M
    m2[5][i] = m1[4][i] - m1[5][i];
1321
116M
    m2[6][i] = m1[6][i] + m1[7][i];
1322
116M
    m2[7][i] = m1[6][i] - m1[7][i];
1323
116M
  }
1324
1325
130M
  for (i = 0; i < 8; i++)
1326
116M
  {
1327
1.04G
    for (j = 0; j < 8; j++)
1328
931M
    {
1329
931M
      sad += abs(m2[i][j]);
1330
931M
    }
1331
116M
  }
1332
  
1333
14.5M
  sad -= abs( m2[0][0] );
1334
14.5M
  sad += abs( m2[0][0] ) >> 2;
1335
14.5M
  sad=((sad+2)>>2);
1336
1337
14.5M
  return sad;
1338
14.5M
}
1339
1340
static Distortion xCalcHADs16x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1341
853k
{   //need to add SIMD implementation ,JCA
1342
853k
  int k, i, j, jj, sad = 0;
1343
853k
  int diff[128], m1[8][16], m2[8][16];
1344
7.68M
  for( k = 0; k < 128; k += 16 )
1345
6.82M
  {
1346
6.82M
    diff[k + 0] = piOrg[0] - piCur[0];
1347
6.82M
    diff[k + 1] = piOrg[1] - piCur[1];
1348
6.82M
    diff[k + 2] = piOrg[2] - piCur[2];
1349
6.82M
    diff[k + 3] = piOrg[3] - piCur[3];
1350
6.82M
    diff[k + 4] = piOrg[4] - piCur[4];
1351
6.82M
    diff[k + 5] = piOrg[5] - piCur[5];
1352
6.82M
    diff[k + 6] = piOrg[6] - piCur[6];
1353
6.82M
    diff[k + 7] = piOrg[7] - piCur[7];
1354
1355
6.82M
    diff[k + 8] = piOrg[8] - piCur[8];
1356
6.82M
    diff[k + 9] = piOrg[9] - piCur[9];
1357
6.82M
    diff[k + 10] = piOrg[10] - piCur[10];
1358
6.82M
    diff[k + 11] = piOrg[11] - piCur[11];
1359
6.82M
    diff[k + 12] = piOrg[12] - piCur[12];
1360
6.82M
    diff[k + 13] = piOrg[13] - piCur[13];
1361
6.82M
    diff[k + 14] = piOrg[14] - piCur[14];
1362
6.82M
    diff[k + 15] = piOrg[15] - piCur[15];
1363
1364
6.82M
    piCur += iStrideCur;
1365
6.82M
    piOrg += iStrideOrg;
1366
6.82M
  }
1367
1368
  //horizontal
1369
7.68M
  for( j = 0; j < 8; j++ )
1370
6.82M
  {
1371
6.82M
    jj = j << 4;
1372
1373
6.82M
    m2[j][0] = diff[jj    ] + diff[jj + 8];
1374
6.82M
    m2[j][1] = diff[jj + 1] + diff[jj + 9];
1375
6.82M
    m2[j][2] = diff[jj + 2] + diff[jj + 10];
1376
6.82M
    m2[j][3] = diff[jj + 3] + diff[jj + 11];
1377
6.82M
    m2[j][4] = diff[jj + 4] + diff[jj + 12];
1378
6.82M
    m2[j][5] = diff[jj + 5] + diff[jj + 13];
1379
6.82M
    m2[j][6] = diff[jj + 6] + diff[jj + 14];
1380
6.82M
    m2[j][7] = diff[jj + 7] + diff[jj + 15];
1381
6.82M
    m2[j][8] = diff[jj    ] - diff[jj + 8];
1382
6.82M
    m2[j][9] = diff[jj + 1] - diff[jj + 9];
1383
6.82M
    m2[j][10] = diff[jj + 2] - diff[jj + 10];
1384
6.82M
    m2[j][11] = diff[jj + 3] - diff[jj + 11];
1385
6.82M
    m2[j][12] = diff[jj + 4] - diff[jj + 12];
1386
6.82M
    m2[j][13] = diff[jj + 5] - diff[jj + 13];
1387
6.82M
    m2[j][14] = diff[jj + 6] - diff[jj + 14];
1388
6.82M
    m2[j][15] = diff[jj + 7] - diff[jj + 15];
1389
1390
6.82M
    m1[j][0] = m2[j][0] + m2[j][4];
1391
6.82M
    m1[j][1] = m2[j][1] + m2[j][5];
1392
6.82M
    m1[j][2] = m2[j][2] + m2[j][6];
1393
6.82M
    m1[j][3] = m2[j][3] + m2[j][7];
1394
6.82M
    m1[j][4] = m2[j][0] - m2[j][4];
1395
6.82M
    m1[j][5] = m2[j][1] - m2[j][5];
1396
6.82M
    m1[j][6] = m2[j][2] - m2[j][6];
1397
6.82M
    m1[j][7] = m2[j][3] - m2[j][7];
1398
6.82M
    m1[j][8] = m2[j][8] + m2[j][12];
1399
6.82M
    m1[j][9] = m2[j][9] + m2[j][13];
1400
6.82M
    m1[j][10] = m2[j][10] + m2[j][14];
1401
6.82M
    m1[j][11] = m2[j][11] + m2[j][15];
1402
6.82M
    m1[j][12] = m2[j][8] - m2[j][12];
1403
6.82M
    m1[j][13] = m2[j][9] - m2[j][13];
1404
6.82M
    m1[j][14] = m2[j][10] - m2[j][14];
1405
6.82M
    m1[j][15] = m2[j][11] - m2[j][15];
1406
1407
6.82M
    m2[j][0] = m1[j][0] + m1[j][2];
1408
6.82M
    m2[j][1] = m1[j][1] + m1[j][3];
1409
6.82M
    m2[j][2] = m1[j][0] - m1[j][2];
1410
6.82M
    m2[j][3] = m1[j][1] - m1[j][3];
1411
6.82M
    m2[j][4] = m1[j][4] + m1[j][6];
1412
6.82M
    m2[j][5] = m1[j][5] + m1[j][7];
1413
6.82M
    m2[j][6] = m1[j][4] - m1[j][6];
1414
6.82M
    m2[j][7] = m1[j][5] - m1[j][7];
1415
6.82M
    m2[j][8] = m1[j][8] + m1[j][10];
1416
6.82M
    m2[j][9] = m1[j][9] + m1[j][11];
1417
6.82M
    m2[j][10] = m1[j][8] - m1[j][10];
1418
6.82M
    m2[j][11] = m1[j][9] - m1[j][11];
1419
6.82M
    m2[j][12] = m1[j][12] + m1[j][14];
1420
6.82M
    m2[j][13] = m1[j][13] + m1[j][15];
1421
6.82M
    m2[j][14] = m1[j][12] - m1[j][14];
1422
6.82M
    m2[j][15] = m1[j][13] - m1[j][15];
1423
1424
6.82M
    m1[j][0] = m2[j][0] + m2[j][1];
1425
6.82M
    m1[j][1] = m2[j][0] - m2[j][1];
1426
6.82M
    m1[j][2] = m2[j][2] + m2[j][3];
1427
6.82M
    m1[j][3] = m2[j][2] - m2[j][3];
1428
6.82M
    m1[j][4] = m2[j][4] + m2[j][5];
1429
6.82M
    m1[j][5] = m2[j][4] - m2[j][5];
1430
6.82M
    m1[j][6] = m2[j][6] + m2[j][7];
1431
6.82M
    m1[j][7] = m2[j][6] - m2[j][7];
1432
6.82M
    m1[j][8] = m2[j][8] + m2[j][9];
1433
6.82M
    m1[j][9] = m2[j][8] - m2[j][9];
1434
6.82M
    m1[j][10] = m2[j][10] + m2[j][11];
1435
6.82M
    m1[j][11] = m2[j][10] - m2[j][11];
1436
6.82M
    m1[j][12] = m2[j][12] + m2[j][13];
1437
6.82M
    m1[j][13] = m2[j][12] - m2[j][13];
1438
6.82M
    m1[j][14] = m2[j][14] + m2[j][15];
1439
6.82M
    m1[j][15] = m2[j][14] - m2[j][15];
1440
6.82M
  }
1441
1442
  //vertical
1443
14.5M
  for( i = 0; i < 16; i++ )
1444
13.6M
  {
1445
13.6M
    m2[0][i] = m1[0][i] + m1[4][i];
1446
13.6M
    m2[1][i] = m1[1][i] + m1[5][i];
1447
13.6M
    m2[2][i] = m1[2][i] + m1[6][i];
1448
13.6M
    m2[3][i] = m1[3][i] + m1[7][i];
1449
13.6M
    m2[4][i] = m1[0][i] - m1[4][i];
1450
13.6M
    m2[5][i] = m1[1][i] - m1[5][i];
1451
13.6M
    m2[6][i] = m1[2][i] - m1[6][i];
1452
13.6M
    m2[7][i] = m1[3][i] - m1[7][i];
1453
1454
13.6M
    m1[0][i] = m2[0][i] + m2[2][i];
1455
13.6M
    m1[1][i] = m2[1][i] + m2[3][i];
1456
13.6M
    m1[2][i] = m2[0][i] - m2[2][i];
1457
13.6M
    m1[3][i] = m2[1][i] - m2[3][i];
1458
13.6M
    m1[4][i] = m2[4][i] + m2[6][i];
1459
13.6M
    m1[5][i] = m2[5][i] + m2[7][i];
1460
13.6M
    m1[6][i] = m2[4][i] - m2[6][i];
1461
13.6M
    m1[7][i] = m2[5][i] - m2[7][i];
1462
1463
13.6M
    m2[0][i] = m1[0][i] + m1[1][i];
1464
13.6M
    m2[1][i] = m1[0][i] - m1[1][i];
1465
13.6M
    m2[2][i] = m1[2][i] + m1[3][i];
1466
13.6M
    m2[3][i] = m1[2][i] - m1[3][i];
1467
13.6M
    m2[4][i] = m1[4][i] + m1[5][i];
1468
13.6M
    m2[5][i] = m1[4][i] - m1[5][i];
1469
13.6M
    m2[6][i] = m1[6][i] + m1[7][i];
1470
13.6M
    m2[7][i] = m1[6][i] - m1[7][i];
1471
13.6M
  }
1472
1473
7.68M
  for( i = 0; i < 8; i++ )
1474
6.82M
  {
1475
116M
    for( j = 0; j < 16; j++ )
1476
109M
    {
1477
109M
      sad += abs( m2[i][j] );
1478
109M
    }
1479
6.82M
  }
1480
  
1481
853k
  sad -= abs( m2[0][0] );
1482
853k
  sad += abs( m2[0][0] ) >> 2;
1483
853k
  sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );
1484
1485
853k
  return sad;
1486
853k
}
1487
1488
static Distortion xCalcHADs8x16( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1489
831k
{
1490
831k
  int k, i, j, jj, sad = 0;
1491
831k
  int diff[128], m1[16][8], m2[16][8];
1492
14.1M
  for( k = 0; k < 128; k += 8 )
1493
13.3M
  {
1494
13.3M
    diff[k + 0] = piOrg[0] - piCur[0];
1495
13.3M
    diff[k + 1] = piOrg[1] - piCur[1];
1496
13.3M
    diff[k + 2] = piOrg[2] - piCur[2];
1497
13.3M
    diff[k + 3] = piOrg[3] - piCur[3];
1498
13.3M
    diff[k + 4] = piOrg[4] - piCur[4];
1499
13.3M
    diff[k + 5] = piOrg[5] - piCur[5];
1500
13.3M
    diff[k + 6] = piOrg[6] - piCur[6];
1501
13.3M
    diff[k + 7] = piOrg[7] - piCur[7];
1502
1503
13.3M
    piCur += iStrideCur;
1504
13.3M
    piOrg += iStrideOrg;
1505
13.3M
  }
1506
1507
  //horizontal
1508
14.1M
  for( j = 0; j < 16; j++ )
1509
13.3M
  {
1510
13.3M
    jj = j << 3;
1511
1512
13.3M
    m2[j][0] = diff[jj] + diff[jj + 4];
1513
13.3M
    m2[j][1] = diff[jj + 1] + diff[jj + 5];
1514
13.3M
    m2[j][2] = diff[jj + 2] + diff[jj + 6];
1515
13.3M
    m2[j][3] = diff[jj + 3] + diff[jj + 7];
1516
13.3M
    m2[j][4] = diff[jj] - diff[jj + 4];
1517
13.3M
    m2[j][5] = diff[jj + 1] - diff[jj + 5];
1518
13.3M
    m2[j][6] = diff[jj + 2] - diff[jj + 6];
1519
13.3M
    m2[j][7] = diff[jj + 3] - diff[jj + 7];
1520
1521
13.3M
    m1[j][0] = m2[j][0] + m2[j][2];
1522
13.3M
    m1[j][1] = m2[j][1] + m2[j][3];
1523
13.3M
    m1[j][2] = m2[j][0] - m2[j][2];
1524
13.3M
    m1[j][3] = m2[j][1] - m2[j][3];
1525
13.3M
    m1[j][4] = m2[j][4] + m2[j][6];
1526
13.3M
    m1[j][5] = m2[j][5] + m2[j][7];
1527
13.3M
    m1[j][6] = m2[j][4] - m2[j][6];
1528
13.3M
    m1[j][7] = m2[j][5] - m2[j][7];
1529
1530
13.3M
    m2[j][0] = m1[j][0] + m1[j][1];
1531
13.3M
    m2[j][1] = m1[j][0] - m1[j][1];
1532
13.3M
    m2[j][2] = m1[j][2] + m1[j][3];
1533
13.3M
    m2[j][3] = m1[j][2] - m1[j][3];
1534
13.3M
    m2[j][4] = m1[j][4] + m1[j][5];
1535
13.3M
    m2[j][5] = m1[j][4] - m1[j][5];
1536
13.3M
    m2[j][6] = m1[j][6] + m1[j][7];
1537
13.3M
    m2[j][7] = m1[j][6] - m1[j][7];
1538
13.3M
  }
1539
1540
  //vertical
1541
7.48M
  for( i = 0; i < 8; i++ )
1542
6.65M
  {
1543
6.65M
    m1[0][i] = m2[0][i] + m2[8][i];
1544
6.65M
    m1[1][i] = m2[1][i] + m2[9][i];
1545
6.65M
    m1[2][i] = m2[2][i] + m2[10][i];
1546
6.65M
    m1[3][i] = m2[3][i] + m2[11][i];
1547
6.65M
    m1[4][i] = m2[4][i] + m2[12][i];
1548
6.65M
    m1[5][i] = m2[5][i] + m2[13][i];
1549
6.65M
    m1[6][i] = m2[6][i] + m2[14][i];
1550
6.65M
    m1[7][i] = m2[7][i] + m2[15][i];
1551
6.65M
    m1[8][i] = m2[0][i] - m2[8][i];
1552
6.65M
    m1[9][i] = m2[1][i] - m2[9][i];
1553
6.65M
    m1[10][i] = m2[2][i] - m2[10][i];
1554
6.65M
    m1[11][i] = m2[3][i] - m2[11][i];
1555
6.65M
    m1[12][i] = m2[4][i] - m2[12][i];
1556
6.65M
    m1[13][i] = m2[5][i] - m2[13][i];
1557
6.65M
    m1[14][i] = m2[6][i] - m2[14][i];
1558
6.65M
    m1[15][i] = m2[7][i] - m2[15][i];
1559
1560
6.65M
    m2[0][i] = m1[0][i] + m1[4][i];
1561
6.65M
    m2[1][i] = m1[1][i] + m1[5][i];
1562
6.65M
    m2[2][i] = m1[2][i] + m1[6][i];
1563
6.65M
    m2[3][i] = m1[3][i] + m1[7][i];
1564
6.65M
    m2[4][i] = m1[0][i] - m1[4][i];
1565
6.65M
    m2[5][i] = m1[1][i] - m1[5][i];
1566
6.65M
    m2[6][i] = m1[2][i] - m1[6][i];
1567
6.65M
    m2[7][i] = m1[3][i] - m1[7][i];
1568
6.65M
    m2[8][i] = m1[8][i] + m1[12][i];
1569
6.65M
    m2[9][i] = m1[9][i] + m1[13][i];
1570
6.65M
    m2[10][i] = m1[10][i] + m1[14][i];
1571
6.65M
    m2[11][i] = m1[11][i] + m1[15][i];
1572
6.65M
    m2[12][i] = m1[8][i] - m1[12][i];
1573
6.65M
    m2[13][i] = m1[9][i] - m1[13][i];
1574
6.65M
    m2[14][i] = m1[10][i] - m1[14][i];
1575
6.65M
    m2[15][i] = m1[11][i] - m1[15][i];
1576
1577
6.65M
    m1[0][i] = m2[0][i] + m2[2][i];
1578
6.65M
    m1[1][i] = m2[1][i] + m2[3][i];
1579
6.65M
    m1[2][i] = m2[0][i] - m2[2][i];
1580
6.65M
    m1[3][i] = m2[1][i] - m2[3][i];
1581
6.65M
    m1[4][i] = m2[4][i] + m2[6][i];
1582
6.65M
    m1[5][i] = m2[5][i] + m2[7][i];
1583
6.65M
    m1[6][i] = m2[4][i] - m2[6][i];
1584
6.65M
    m1[7][i] = m2[5][i] - m2[7][i];
1585
6.65M
    m1[8][i] = m2[8][i] + m2[10][i];
1586
6.65M
    m1[9][i] = m2[9][i] + m2[11][i];
1587
6.65M
    m1[10][i] = m2[8][i] - m2[10][i];
1588
6.65M
    m1[11][i] = m2[9][i] - m2[11][i];
1589
6.65M
    m1[12][i] = m2[12][i] + m2[14][i];
1590
6.65M
    m1[13][i] = m2[13][i] + m2[15][i];
1591
6.65M
    m1[14][i] = m2[12][i] - m2[14][i];
1592
6.65M
    m1[15][i] = m2[13][i] - m2[15][i];
1593
1594
6.65M
    m2[0][i] = m1[0][i] + m1[1][i];
1595
6.65M
    m2[1][i] = m1[0][i] - m1[1][i];
1596
6.65M
    m2[2][i] = m1[2][i] + m1[3][i];
1597
6.65M
    m2[3][i] = m1[2][i] - m1[3][i];
1598
6.65M
    m2[4][i] = m1[4][i] + m1[5][i];
1599
6.65M
    m2[5][i] = m1[4][i] - m1[5][i];
1600
6.65M
    m2[6][i] = m1[6][i] + m1[7][i];
1601
6.65M
    m2[7][i] = m1[6][i] - m1[7][i];
1602
6.65M
    m2[8][i] = m1[8][i] + m1[9][i];
1603
6.65M
    m2[9][i] = m1[8][i] - m1[9][i];
1604
6.65M
    m2[10][i] = m1[10][i] + m1[11][i];
1605
6.65M
    m2[11][i] = m1[10][i] - m1[11][i];
1606
6.65M
    m2[12][i] = m1[12][i] + m1[13][i];
1607
6.65M
    m2[13][i] = m1[12][i] - m1[13][i];
1608
6.65M
    m2[14][i] = m1[14][i] + m1[15][i];
1609
6.65M
    m2[15][i] = m1[14][i] - m1[15][i];
1610
6.65M
  }
1611
1612
14.1M
  for( i = 0; i < 16; i++ )
1613
13.3M
  {
1614
119M
    for( j = 0; j < 8; j++ )
1615
106M
    {
1616
106M
      sad += abs( m2[i][j] );
1617
106M
    }
1618
13.3M
  }
1619
  
1620
831k
  sad -= abs( m2[0][0] );
1621
831k
  sad += abs( m2[0][0] ) >> 2;
1622
831k
  sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );
1623
1624
831k
  return sad;
1625
831k
}
1626
1627
static Distortion xCalcHADs4x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1628
150k
{
1629
150k
  int k, i, j, jj, sad = 0;
1630
150k
  int diff[32], m1[8][4], m2[8][4];
1631
1.35M
  for( k = 0; k < 32; k += 4 )
1632
1.20M
  {
1633
1.20M
    diff[k + 0] = piOrg[0] - piCur[0];
1634
1.20M
    diff[k + 1] = piOrg[1] - piCur[1];
1635
1.20M
    diff[k + 2] = piOrg[2] - piCur[2];
1636
1.20M
    diff[k + 3] = piOrg[3] - piCur[3];
1637
1638
1.20M
    piCur += iStrideCur;
1639
1.20M
    piOrg += iStrideOrg;
1640
1.20M
  }
1641
1642
  //horizontal
1643
1.35M
  for( j = 0; j < 8; j++ )
1644
1.20M
  {
1645
1.20M
    jj = j << 2;
1646
1.20M
    m2[j][0] = diff[jj] + diff[jj + 2];
1647
1.20M
    m2[j][1] = diff[jj + 1] + diff[jj + 3];
1648
1.20M
    m2[j][2] = diff[jj] - diff[jj + 2];
1649
1.20M
    m2[j][3] = diff[jj + 1] - diff[jj + 3];
1650
1651
1.20M
    m1[j][0] = m2[j][0] + m2[j][1];
1652
1.20M
    m1[j][1] = m2[j][0] - m2[j][1];
1653
1.20M
    m1[j][2] = m2[j][2] + m2[j][3];
1654
1.20M
    m1[j][3] = m2[j][2] - m2[j][3];
1655
1.20M
  }
1656
1657
  //vertical
1658
753k
  for( i = 0; i < 4; i++ )
1659
602k
  {
1660
602k
    m2[0][i] = m1[0][i] + m1[4][i];
1661
602k
    m2[1][i] = m1[1][i] + m1[5][i];
1662
602k
    m2[2][i] = m1[2][i] + m1[6][i];
1663
602k
    m2[3][i] = m1[3][i] + m1[7][i];
1664
602k
    m2[4][i] = m1[0][i] - m1[4][i];
1665
602k
    m2[5][i] = m1[1][i] - m1[5][i];
1666
602k
    m2[6][i] = m1[2][i] - m1[6][i];
1667
602k
    m2[7][i] = m1[3][i] - m1[7][i];
1668
1669
602k
    m1[0][i] = m2[0][i] + m2[2][i];
1670
602k
    m1[1][i] = m2[1][i] + m2[3][i];
1671
602k
    m1[2][i] = m2[0][i] - m2[2][i];
1672
602k
    m1[3][i] = m2[1][i] - m2[3][i];
1673
602k
    m1[4][i] = m2[4][i] + m2[6][i];
1674
602k
    m1[5][i] = m2[5][i] + m2[7][i];
1675
602k
    m1[6][i] = m2[4][i] - m2[6][i];
1676
602k
    m1[7][i] = m2[5][i] - m2[7][i];
1677
1678
602k
    m2[0][i] = m1[0][i] + m1[1][i];
1679
602k
    m2[1][i] = m1[0][i] - m1[1][i];
1680
602k
    m2[2][i] = m1[2][i] + m1[3][i];
1681
602k
    m2[3][i] = m1[2][i] - m1[3][i];
1682
602k
    m2[4][i] = m1[4][i] + m1[5][i];
1683
602k
    m2[5][i] = m1[4][i] - m1[5][i];
1684
602k
    m2[6][i] = m1[6][i] + m1[7][i];
1685
602k
    m2[7][i] = m1[6][i] - m1[7][i];
1686
602k
  }
1687
1688
1.35M
  for( i = 0; i < 8; i++ )
1689
1.20M
  {
1690
6.02M
    for( j = 0; j < 4; j++ )
1691
4.81M
    {
1692
4.81M
      sad += abs( m2[i][j] );
1693
4.81M
    }
1694
1.20M
  }
1695
  
1696
150k
  sad -= abs( m2[0][0] );
1697
150k
  sad += abs( m2[0][0] ) >> 2;
1698
150k
  sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );
1699
1700
150k
  return sad;
1701
150k
}
1702
1703
static Distortion xCalcHADs8x4( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1704
161k
{
1705
161k
  int k, i, j, jj, sad = 0;
1706
161k
  int diff[32], m1[4][8], m2[4][8];
1707
808k
  for( k = 0; k < 32; k += 8 )
1708
646k
  {
1709
646k
    diff[k + 0] = piOrg[0] - piCur[0];
1710
646k
    diff[k + 1] = piOrg[1] - piCur[1];
1711
646k
    diff[k + 2] = piOrg[2] - piCur[2];
1712
646k
    diff[k + 3] = piOrg[3] - piCur[3];
1713
646k
    diff[k + 4] = piOrg[4] - piCur[4];
1714
646k
    diff[k + 5] = piOrg[5] - piCur[5];
1715
646k
    diff[k + 6] = piOrg[6] - piCur[6];
1716
646k
    diff[k + 7] = piOrg[7] - piCur[7];
1717
1718
646k
    piCur += iStrideCur;
1719
646k
    piOrg += iStrideOrg;
1720
646k
  }
1721
1722
  //horizontal
1723
808k
  for( j = 0; j < 4; j++ )
1724
646k
  {
1725
646k
    jj = j << 3;
1726
1727
646k
    m2[j][0] = diff[jj] + diff[jj + 4];
1728
646k
    m2[j][1] = diff[jj + 1] + diff[jj + 5];
1729
646k
    m2[j][2] = diff[jj + 2] + diff[jj + 6];
1730
646k
    m2[j][3] = diff[jj + 3] + diff[jj + 7];
1731
646k
    m2[j][4] = diff[jj] - diff[jj + 4];
1732
646k
    m2[j][5] = diff[jj + 1] - diff[jj + 5];
1733
646k
    m2[j][6] = diff[jj + 2] - diff[jj + 6];
1734
646k
    m2[j][7] = diff[jj + 3] - diff[jj + 7];
1735
1736
646k
    m1[j][0] = m2[j][0] + m2[j][2];
1737
646k
    m1[j][1] = m2[j][1] + m2[j][3];
1738
646k
    m1[j][2] = m2[j][0] - m2[j][2];
1739
646k
    m1[j][3] = m2[j][1] - m2[j][3];
1740
646k
    m1[j][4] = m2[j][4] + m2[j][6];
1741
646k
    m1[j][5] = m2[j][5] + m2[j][7];
1742
646k
    m1[j][6] = m2[j][4] - m2[j][6];
1743
646k
    m1[j][7] = m2[j][5] - m2[j][7];
1744
1745
646k
    m2[j][0] = m1[j][0] + m1[j][1];
1746
646k
    m2[j][1] = m1[j][0] - m1[j][1];
1747
646k
    m2[j][2] = m1[j][2] + m1[j][3];
1748
646k
    m2[j][3] = m1[j][2] - m1[j][3];
1749
646k
    m2[j][4] = m1[j][4] + m1[j][5];
1750
646k
    m2[j][5] = m1[j][4] - m1[j][5];
1751
646k
    m2[j][6] = m1[j][6] + m1[j][7];
1752
646k
    m2[j][7] = m1[j][6] - m1[j][7];
1753
646k
  }
1754
1755
  //vertical
1756
1.45M
  for( i = 0; i < 8; i++ )
1757
1.29M
  {
1758
1.29M
    m1[0][i] = m2[0][i] + m2[2][i];
1759
1.29M
    m1[1][i] = m2[1][i] + m2[3][i];
1760
1.29M
    m1[2][i] = m2[0][i] - m2[2][i];
1761
1.29M
    m1[3][i] = m2[1][i] - m2[3][i];
1762
1763
1.29M
    m2[0][i] = m1[0][i] + m1[1][i];
1764
1.29M
    m2[1][i] = m1[0][i] - m1[1][i];
1765
1.29M
    m2[2][i] = m1[2][i] + m1[3][i];
1766
1.29M
    m2[3][i] = m1[2][i] - m1[3][i];
1767
1.29M
  }
1768
1769
808k
  for( i = 0; i < 4; i++ )
1770
646k
  {
1771
5.82M
    for( j = 0; j < 8; j++ )
1772
5.17M
    {
1773
5.17M
      sad += abs( m2[i][j] );
1774
5.17M
    }
1775
646k
  }
1776
  
1777
161k
  sad -= abs( m2[0][0] );
1778
161k
  sad += abs( m2[0][0] ) >> 2;
1779
161k
  sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );
1780
1781
161k
  return sad;
1782
161k
}
1783
1784
Distortion RdCost::xGetHAD2SADs( const DistParam &rcDtParam )
1785
864k
{
1786
864k
  if( rcDtParam.applyWeight )
1787
0
  {
1788
0
    THROW(" no support");
1789
0
  }
1790
1791
864k
  Distortion distHad = xGetHADs<false>( rcDtParam );
1792
864k
  Distortion distSad = 0;
1793
864k
  {
1794
864k
    CHECKD( (rcDtParam.org.width != rcDtParam.org.stride) || (rcDtParam.cur.stride != rcDtParam.org.stride) , "this functions assumes compact, aligned buffering");
1795
1796
864k
    const Pel* piOrg  = rcDtParam.org.buf;
1797
864k
    const Pel* piCur  = rcDtParam.cur.buf;
1798
864k
    int  iRows        = rcDtParam.org.height>>2;
1799
864k
    int  iCols        = rcDtParam.org.width<<2;
1800
1801
864k
    Distortion uiSum = 0;
1802
1803
7.46M
    for( int y = 0; y < iRows;  y++ )
1804
6.60M
    {
1805
71.2M
      for (int n = 0; n < iCols; n+=16 )
1806
64.5M
      {
1807
64.5M
        uiSum += abs( piOrg[n+ 0] - piCur[n+ 0] );
1808
64.5M
        uiSum += abs( piOrg[n+ 1] - piCur[n+ 1] );
1809
64.5M
        uiSum += abs( piOrg[n+ 2] - piCur[n+ 2] );
1810
64.5M
        uiSum += abs( piOrg[n+ 3] - piCur[n+ 3] );
1811
64.5M
        uiSum += abs( piOrg[n+ 4] - piCur[n+ 4] );
1812
64.5M
        uiSum += abs( piOrg[n+ 5] - piCur[n+ 5] );
1813
64.5M
        uiSum += abs( piOrg[n+ 6] - piCur[n+ 6] );
1814
64.5M
        uiSum += abs( piOrg[n+ 7] - piCur[n+ 7] );
1815
64.5M
        uiSum += abs( piOrg[n+ 8] - piCur[n+ 8] );
1816
64.5M
        uiSum += abs( piOrg[n+ 9] - piCur[n+ 9] );
1817
64.5M
        uiSum += abs( piOrg[n+10] - piCur[n+10] );
1818
64.5M
        uiSum += abs( piOrg[n+11] - piCur[n+11] );
1819
64.5M
        uiSum += abs( piOrg[n+12] - piCur[n+12] );
1820
64.5M
        uiSum += abs( piOrg[n+13] - piCur[n+13] );
1821
64.5M
        uiSum += abs( piOrg[n+14] - piCur[n+14] );
1822
64.5M
        uiSum += abs( piOrg[n+15] - piCur[n+15] );
1823
64.5M
      }
1824
6.60M
      piOrg += iCols;
1825
6.60M
      piCur += iCols;
1826
6.60M
    }
1827
1828
864k
    distSad = (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1829
864k
  }
1830
1831
0
  return std::min( distHad, 2*distSad);
1832
864k
}
1833
1834
template<bool fastHad>
1835
Distortion RdCost::xGetHADs( const DistParam &rcDtParam )
1836
1.42M
{
1837
1.42M
  if( rcDtParam.applyWeight )
1838
0
  {
1839
0
    THROW(" no support");
1840
0
  }
1841
1.42M
  const Pel* piOrg = rcDtParam.org.buf;
1842
1.42M
  const Pel* piCur = rcDtParam.cur.buf;
1843
1.42M
  const int  iRows = rcDtParam.org.height;
1844
1.42M
  const int  iCols = rcDtParam.org.width;
1845
1.42M
  const int  iStrideCur = rcDtParam.cur.stride;
1846
1.42M
  const int  iStrideOrg = rcDtParam.org.stride;
1847
1848
1.42M
  int  x = 0, y = 0;
1849
1850
1.42M
  Distortion uiSum = 0;
1851
1852
1.42M
  if( iCols > iRows && ( iRows & 7 ) == 0 && ( iCols & 15 ) == 0 )
1853
315k
  {
1854
776k
    for( y = 0; y < iRows; y += 8 )
1855
461k
    {
1856
1.31M
      for( x = 0; x < iCols; x += 16 )
1857
853k
      {
1858
853k
        uiSum += xCalcHADs16x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1859
853k
      }
1860
461k
      piOrg += iStrideOrg * 8;
1861
461k
      piCur += iStrideCur * 8;
1862
461k
    }
1863
315k
  }
1864
1.11M
  else if( iCols < iRows && ( iCols & 7 ) == 0 && ( iRows & 15 ) == 0 )
1865
301k
  {
1866
844k
    for( y = 0; y < iRows; y += 16 )
1867
543k
    {
1868
1.37M
      for( x = 0; x < iCols; x += 8 )
1869
831k
      {
1870
831k
        uiSum += xCalcHADs8x16( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1871
831k
      }
1872
543k
      piOrg += iStrideOrg * 16;
1873
543k
      piCur += iStrideCur * 16;
1874
543k
    }
1875
301k
  }
1876
809k
  else if( iCols > iRows && ( iRows & 3 ) == 0 && ( iCols & 7 ) == 0 )
1877
86.6k
  {
1878
173k
    for( y = 0; y < iRows; y += 4 )
1879
86.6k
    {
1880
248k
      for( x = 0; x < iCols; x += 8 )
1881
161k
      {
1882
161k
        uiSum += xCalcHADs8x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1883
161k
      }
1884
86.6k
      piOrg += iStrideOrg * 4;
1885
86.6k
      piCur += iStrideCur * 4;
1886
86.6k
    }
1887
86.6k
  }
1888
722k
  else if( iCols < iRows && ( iCols & 3 ) == 0 && ( iRows & 7 ) == 0 )
1889
80.0k
  {
1890
230k
    for( y = 0; y < iRows; y += 8 )
1891
150k
    {
1892
301k
      for( x = 0; x < iCols; x += 4 )
1893
150k
      {
1894
150k
        uiSum += xCalcHADs4x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1895
150k
      }
1896
150k
      piOrg += iStrideOrg * 8;
1897
150k
      piCur += iStrideCur * 8;
1898
150k
    }
1899
80.0k
  }
1900
642k
  else if( fastHad && ( ( iRows % 32 == 0 ) && ( iCols % 32 == 0 ) ) && iRows == iCols )
1901
0
  {
1902
0
    for( y = 0; y < iRows; y += 16 )
1903
0
    {
1904
0
      for( x = 0; x < iCols; x += 16 )
1905
0
      {
1906
0
        uiSum += xCalcHADs16x16_fast( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1907
0
      }
1908
0
      piOrg += 16 * iStrideOrg;
1909
0
      piCur += 16 * iStrideCur;
1910
0
    }
1911
0
  }
1912
642k
  else if( ( iRows % 8 == 0 ) && ( iCols % 8 == 0 ) )
1913
575k
  {
1914
3.04M
    for( y = 0; y < iRows; y += 8 )
1915
2.47M
    {
1916
17.0M
      for( x = 0; x < iCols; x += 8 )
1917
14.5M
      {
1918
14.5M
        uiSum += xCalcHADs8x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1919
14.5M
      }
1920
2.47M
      piOrg += 8*iStrideOrg;
1921
2.47M
      piCur += 8*iStrideCur;
1922
2.47M
    }
1923
575k
  }
1924
67.1k
  else if( ( iRows % 4 == 0 ) && ( iCols % 4 == 0 ) )
1925
40.5k
  {
1926
81.0k
    for( y = 0; y < iRows; y += 4 )
1927
40.5k
    {
1928
81.0k
      for( x = 0; x < iCols; x += 4 )
1929
40.5k
      {
1930
40.5k
        uiSum += xCalcHADs4x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1931
40.5k
      }
1932
40.5k
      piOrg += 4*iStrideOrg;
1933
40.5k
      piCur += 4*iStrideCur;
1934
40.5k
    }
1935
40.5k
  }
1936
26.7k
  else if( ( iRows % 2 == 0 ) && ( iCols % 2 == 0 ) )
1937
26.7k
  {
1938
53.4k
    for( y = 0; y < iRows; y += 2 )
1939
26.7k
    {
1940
174k
      for( x = 0; x < iCols; x += 2 )
1941
148k
      {
1942
148k
        uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1943
148k
      }
1944
26.7k
      piOrg += 2*iStrideOrg;
1945
26.7k
      piCur += 2*iStrideCur;
1946
26.7k
    }
1947
26.7k
  }
1948
18.4E
  else
1949
18.4E
  {
1950
18.4E
    THROW( "Invalid size" );
1951
18.4E
  }
1952
1953
1.42M
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1954
1.42M
}
unsigned long vvenc::RdCost::xGetHADs<false>(vvenc::DistParam const&)
Line
Count
Source
1836
1.42M
{
1837
1.42M
  if( rcDtParam.applyWeight )
1838
0
  {
1839
0
    THROW(" no support");
1840
0
  }
1841
1.42M
  const Pel* piOrg = rcDtParam.org.buf;
1842
1.42M
  const Pel* piCur = rcDtParam.cur.buf;
1843
1.42M
  const int  iRows = rcDtParam.org.height;
1844
1.42M
  const int  iCols = rcDtParam.org.width;
1845
1.42M
  const int  iStrideCur = rcDtParam.cur.stride;
1846
1.42M
  const int  iStrideOrg = rcDtParam.org.stride;
1847
1848
1.42M
  int  x = 0, y = 0;
1849
1850
1.42M
  Distortion uiSum = 0;
1851
1852
1.42M
  if( iCols > iRows && ( iRows & 7 ) == 0 && ( iCols & 15 ) == 0 )
1853
315k
  {
1854
776k
    for( y = 0; y < iRows; y += 8 )
1855
461k
    {
1856
1.31M
      for( x = 0; x < iCols; x += 16 )
1857
853k
      {
1858
853k
        uiSum += xCalcHADs16x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1859
853k
      }
1860
461k
      piOrg += iStrideOrg * 8;
1861
461k
      piCur += iStrideCur * 8;
1862
461k
    }
1863
315k
  }
1864
1.11M
  else if( iCols < iRows && ( iCols & 7 ) == 0 && ( iRows & 15 ) == 0 )
1865
301k
  {
1866
844k
    for( y = 0; y < iRows; y += 16 )
1867
543k
    {
1868
1.37M
      for( x = 0; x < iCols; x += 8 )
1869
831k
      {
1870
831k
        uiSum += xCalcHADs8x16( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1871
831k
      }
1872
543k
      piOrg += iStrideOrg * 16;
1873
543k
      piCur += iStrideCur * 16;
1874
543k
    }
1875
301k
  }
1876
809k
  else if( iCols > iRows && ( iRows & 3 ) == 0 && ( iCols & 7 ) == 0 )
1877
86.6k
  {
1878
173k
    for( y = 0; y < iRows; y += 4 )
1879
86.6k
    {
1880
248k
      for( x = 0; x < iCols; x += 8 )
1881
161k
      {
1882
161k
        uiSum += xCalcHADs8x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1883
161k
      }
1884
86.6k
      piOrg += iStrideOrg * 4;
1885
86.6k
      piCur += iStrideCur * 4;
1886
86.6k
    }
1887
86.6k
  }
1888
722k
  else if( iCols < iRows && ( iCols & 3 ) == 0 && ( iRows & 7 ) == 0 )
1889
80.0k
  {
1890
230k
    for( y = 0; y < iRows; y += 8 )
1891
150k
    {
1892
301k
      for( x = 0; x < iCols; x += 4 )
1893
150k
      {
1894
150k
        uiSum += xCalcHADs4x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1895
150k
      }
1896
150k
      piOrg += iStrideOrg * 8;
1897
150k
      piCur += iStrideCur * 8;
1898
150k
    }
1899
80.0k
  }
1900
642k
  else if( fastHad && ( ( iRows % 32 == 0 ) && ( iCols % 32 == 0 ) ) && iRows == iCols )
1901
0
  {
1902
0
    for( y = 0; y < iRows; y += 16 )
1903
0
    {
1904
0
      for( x = 0; x < iCols; x += 16 )
1905
0
      {
1906
0
        uiSum += xCalcHADs16x16_fast( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1907
0
      }
1908
0
      piOrg += 16 * iStrideOrg;
1909
0
      piCur += 16 * iStrideCur;
1910
0
    }
1911
0
  }
1912
642k
  else if( ( iRows % 8 == 0 ) && ( iCols % 8 == 0 ) )
1913
575k
  {
1914
3.04M
    for( y = 0; y < iRows; y += 8 )
1915
2.47M
    {
1916
17.0M
      for( x = 0; x < iCols; x += 8 )
1917
14.5M
      {
1918
14.5M
        uiSum += xCalcHADs8x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1919
14.5M
      }
1920
2.47M
      piOrg += 8*iStrideOrg;
1921
2.47M
      piCur += 8*iStrideCur;
1922
2.47M
    }
1923
575k
  }
1924
67.1k
  else if( ( iRows % 4 == 0 ) && ( iCols % 4 == 0 ) )
1925
40.5k
  {
1926
81.0k
    for( y = 0; y < iRows; y += 4 )
1927
40.5k
    {
1928
81.0k
      for( x = 0; x < iCols; x += 4 )
1929
40.5k
      {
1930
40.5k
        uiSum += xCalcHADs4x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1931
40.5k
      }
1932
40.5k
      piOrg += 4*iStrideOrg;
1933
40.5k
      piCur += 4*iStrideCur;
1934
40.5k
    }
1935
40.5k
  }
1936
26.7k
  else if( ( iRows % 2 == 0 ) && ( iCols % 2 == 0 ) )
1937
26.7k
  {
1938
53.4k
    for( y = 0; y < iRows; y += 2 )
1939
26.7k
    {
1940
174k
      for( x = 0; x < iCols; x += 2 )
1941
148k
      {
1942
148k
        uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1943
148k
      }
1944
26.7k
      piOrg += 2*iStrideOrg;
1945
26.7k
      piCur += 2*iStrideCur;
1946
26.7k
    }
1947
26.7k
  }
1948
18.4E
  else
1949
18.4E
  {
1950
18.4E
    THROW( "Invalid size" );
1951
18.4E
  }
1952
1953
1.42M
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1954
1.42M
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs<true>(vvenc::DistParam const&)
1955
1956
1957
void RdCost::saveUnadjustedLambda()
1958
11.2k
{
1959
11.2k
  m_dLambda_unadjusted = m_dLambda;
1960
11.2k
  m_DistScaleUnadjusted = m_DistScale;
1961
11.2k
}
1962
1963
1964
inline Distortion getWeightedMSE(const Pel org, const Pel cur, const int64_t fixedPTweight, unsigned uiShift)
1965
0
{
1966
0
  const Intermediate_Int iTemp = org - cur;
1967
0
  return Intermediate_Int((fixedPTweight*(iTemp*iTemp) + (1 << 15)) >> uiShift);
1968
0
}
1969
1970
template<int csx>
1971
static Distortion lumaWeightedSSE_Core( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights )
1972
0
{
1973
0
        int  iRows = rcDtParam.org.height;
1974
0
  const Pel* piOrg = rcDtParam.org.buf;
1975
0
  const Pel* piCur = rcDtParam.cur.buf;
1976
0
  const int  iCols = rcDtParam.org.width;
1977
0
  const int  iStrideCur = rcDtParam.cur.stride;
1978
0
  const int  iStrideOrg = rcDtParam.org.stride;
1979
0
  const Pel* piOrgLuma        = rcDtParam.orgLuma->buf;
1980
0
  const int  iStrideOrgLuma   = rcDtParam.orgLuma->stride;
1981
1982
0
  Distortion uiSum   = 0;
1983
0
  uint32_t uiShift   = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1);
1984
1985
  // cf, column factor, offset of the second column, to be set to '0' for width of '1'
1986
0
  const int cf =  1 - ( iCols & 1 );
1987
0
  CHECK( ( iCols & 1 ) && iCols != 1, "Width can only be even or equal to '1'!" );
1988
0
  const ComponentID compId = rcDtParam.compID;
1989
0
  const size_t  cShiftY    = getComponentScaleY(compId, chmFmt);
1990
1991
0
  for( ; iRows != 0; iRows-- )
1992
0
  {
1993
0
    for (int n = 0; n < iCols; n+=2 )
1994
0
    {
1995
0
      uiSum += getWeightedMSE( piOrg[n   ], piCur[n   ], lumaWeights[piOrgLuma[(n   )<<csx]], uiShift );
1996
0
      uiSum += getWeightedMSE( piOrg[n+cf], piCur[n+cf], lumaWeights[piOrgLuma[(n+cf)<<csx]], uiShift );
1997
0
    }
1998
1999
0
    piOrg     += iStrideOrg;
2000
0
    piCur     += iStrideCur;
2001
0
    piOrgLuma += iStrideOrgLuma<<cShiftY;
2002
0
  }
2003
2004
0
  return ( uiSum >> ( 1 - cf ) );
2005
0
}
Unexecuted instantiation: RdCost.cpp:unsigned long vvenc::lumaWeightedSSE_Core<0>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
Unexecuted instantiation: RdCost.cpp:unsigned long vvenc::lumaWeightedSSE_Core<1>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
2006
2007
static Distortion fixWeightedSSE_Core( const DistParam& rcDtParam, uint32_t fixedPTweight )
2008
0
{
2009
0
        int  iRows = rcDtParam.org.height;
2010
0
  const Pel* piOrg = rcDtParam.org.buf;
2011
0
  const Pel* piCur = rcDtParam.cur.buf;
2012
0
  const int  iCols = rcDtParam.org.width;
2013
0
  const int  iStrideCur = rcDtParam.cur.stride;
2014
0
  const int  iStrideOrg = rcDtParam.org.stride;
2015
2016
0
  Distortion uiSum   = 0;
2017
0
  uint32_t uiShift = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1);
2018
2019
  // cf, column factor, offset of the second column, to be set to '0' for width of '1'
2020
0
  const int cf =  1 - ( iCols & 1 );
2021
0
  CHECK( ( iCols & 1 ) && iCols != 1, "Width can only be even or equal to '1'!" );
2022
  
2023
0
  for( ; iRows != 0; iRows-- )
2024
0
  {
2025
0
    for (int n = 0; n < iCols; n+=2 )
2026
0
    {
2027
0
      uiSum += getWeightedMSE( piOrg[n   ], piCur[n   ], fixedPTweight, uiShift );
2028
0
      uiSum += getWeightedMSE( piOrg[n+cf], piCur[n+cf], fixedPTweight, uiShift );
2029
0
    }
2030
0
    piOrg += iStrideOrg;
2031
0
    piCur += iStrideCur;
2032
0
  }
2033
2034
0
  return ( uiSum >> ( 1 - cf ) );
2035
0
}
2036
2037
Distortion RdCost::xGetSSE_WTD( const DistParam &rcDtParam ) const
2038
0
{
2039
0
  if( rcDtParam.applyWeight )
2040
0
  {
2041
0
    THROW("no support");
2042
0
  }
2043
2044
0
  if ((m_signalType == RESHAPE_SIGNAL_SDR || m_signalType == RESHAPE_SIGNAL_HLG) && rcDtParam.compID != COMP_Y)
2045
0
  {
2046
0
    const uint32_t fixedPTweight = ( uint32_t ) ( m_chromaWeight * ( double ) ( 1 << 16 ) );
2047
2048
0
    return m_fxdWtdPredPtr( rcDtParam, fixedPTweight );
2049
0
  }
2050
0
  else
2051
0
  {
2052
0
    return m_wtdPredPtr[getComponentScaleX(rcDtParam.compID, m_cf)]( rcDtParam, m_cf, m_reshapeLumaLevelToWeightPLUT );
2053
0
  }
2054
2055
0
  return 0;
2056
0
}
2057
2058
0
void RdCost::xGetSAD8X5(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
2059
0
  DistParam rcDtParamTmp0 = rcDtParam;
2060
2061
0
  DistParam rcDtParamTmp1 = rcDtParam;
2062
0
  rcDtParamTmp1.org.buf += 1;
2063
0
  rcDtParamTmp1.cur.buf -= 1;
2064
2065
0
  DistParam rcDtParamTmp2 = rcDtParam;
2066
0
  rcDtParamTmp2.org.buf += 2;
2067
0
  rcDtParamTmp2.cur.buf -= 2;
2068
2069
0
  DistParam rcDtParamTmp3 = rcDtParam;
2070
0
  rcDtParamTmp3.org.buf += 3;
2071
0
  rcDtParamTmp3.cur.buf -= 3;
2072
2073
0
  DistParam rcDtParamTmp4 = rcDtParam;
2074
0
  rcDtParamTmp4.org.buf += 4;
2075
0
  rcDtParamTmp4.cur.buf -= 4;
2076
  
2077
0
  cost[0] = (RdCost::xGetSAD8(rcDtParamTmp0)) >> 1;
2078
0
  cost[1] = (RdCost::xGetSAD8(rcDtParamTmp1)) >> 1;
2079
0
  if (isCalCentrePos) cost[2] = (RdCost::xGetSAD8(rcDtParamTmp2)) >> 1;
2080
0
  cost[3] = (RdCost::xGetSAD8(rcDtParamTmp3)) >> 1;
2081
0
  cost[4] = (RdCost::xGetSAD8(rcDtParamTmp4)) >> 1;
2082
0
}
2083
2084
0
void RdCost::xGetSAD16X5(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
2085
0
  DistParam rcDtParamTmp0 = rcDtParam;
2086
2087
0
  DistParam rcDtParamTmp1 = rcDtParam;
2088
0
  rcDtParamTmp1.org.buf += 1;
2089
0
  rcDtParamTmp1.cur.buf -= 1;
2090
2091
0
  DistParam rcDtParamTmp2 = rcDtParam;
2092
0
  rcDtParamTmp2.org.buf += 2;
2093
0
  rcDtParamTmp2.cur.buf -= 2;
2094
2095
0
  DistParam rcDtParamTmp3 = rcDtParam;
2096
0
  rcDtParamTmp3.org.buf += 3;
2097
0
  rcDtParamTmp3.cur.buf -= 3;
2098
2099
0
  DistParam rcDtParamTmp4 = rcDtParam;
2100
0
  rcDtParamTmp4.org.buf += 4;
2101
0
  rcDtParamTmp4.cur.buf -= 4;
2102
  
2103
0
  cost[0] = (RdCost::xGetSAD16(rcDtParamTmp0)) >> 1;
2104
0
  cost[1] = (RdCost::xGetSAD16(rcDtParamTmp1)) >> 1;
2105
0
  if (isCalCentrePos) cost[2] = (RdCost::xGetSAD16(rcDtParamTmp2)) >> 1;
2106
0
  cost[3] = (RdCost::xGetSAD16(rcDtParamTmp3)) >> 1;
2107
0
  cost[4] = (RdCost::xGetSAD16(rcDtParamTmp4)) >> 1;
2108
0
}
2109
2110
void RdCost::setDistParamGeo(DistParam &rcDP, const CPelBuf &org, const Pel *piRefY, int iRefStride, const Pel *mask,
2111
                          int iMaskStride, int stepX, int iMaskStride2, int bitDepth, ComponentID compID)
2112
0
{
2113
0
  rcDP.bitDepth = bitDepth;
2114
0
  rcDP.compID   = compID;
2115
2116
  // set Original & Curr Pointer / Stride
2117
0
  rcDP.org        = org;
2118
0
  rcDP.cur.buf    = piRefY;
2119
0
  rcDP.cur.stride = iRefStride;
2120
2121
  // set Mask
2122
0
  rcDP.mask        = mask;
2123
0
  rcDP.maskStride  = iMaskStride;
2124
0
  rcDP.stepX       = stepX;
2125
0
  rcDP.maskStride2 = iMaskStride2;
2126
2127
  // set Block Width / Height
2128
0
  rcDP.cur.width                     = org.width;
2129
0
  rcDP.cur.height                    = org.height;
2130
0
  rcDP.maximumDistortionForEarlyExit = MAX_DISTORTION;
2131
2132
  // set Cost function for motion estimation with Mask
2133
0
  rcDP.distFunc = m_afpDistortFunc[0][DF_SAD_WITH_MASK];
2134
0
}
2135
2136
Distortion RdCost::xGetSADwMask(const DistParam &rcDtParam)
2137
0
{
2138
0
  const Pel *    org             = rcDtParam.org.buf;
2139
0
  const Pel *    cur             = rcDtParam.cur.buf;
2140
0
  const Pel *    mask            = rcDtParam.mask;
2141
0
  const int      cols            = rcDtParam.org.width;
2142
0
  int            rows            = rcDtParam.org.height;
2143
0
  const int      subShift        = rcDtParam.subShift;
2144
0
  const int      subStep         = (1 << subShift);
2145
0
  const int      strideCur       = rcDtParam.cur.stride * subStep;
2146
0
  const int      strideOrg       = rcDtParam.org.stride * subStep;
2147
0
  const int      strideMask      = rcDtParam.maskStride * subStep;
2148
0
  const int      stepX           = rcDtParam.stepX;
2149
0
  const int      strideMask2     = rcDtParam.maskStride2;
2150
0
  const uint32_t distortionShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
2151
2152
0
  Distortion sum = 0;
2153
0
  for (; rows != 0; rows -= subStep)
2154
0
  {
2155
0
    for (int n = 0; n < cols; n++)
2156
0
    {
2157
0
      sum += abs(org[n] - cur[n]) * *mask;
2158
0
      mask += stepX;
2159
0
    }
2160
0
    org += strideOrg;
2161
0
    cur += strideCur;
2162
0
    mask += strideMask;
2163
0
    mask += strideMask2;
2164
0
  }
2165
0
  sum <<= subShift;
2166
0
  return (sum >> distortionShift);
2167
0
}
2168
2169
Distortion RdCost::getBvCostMultiplePredsIBC(int x, int y, bool useIMV)
2170
1.11M
{
2171
1.11M
  return Distortion(m_dCostIBC * getBitsMultiplePredsIBC(x, y, useIMV));
2172
1.11M
}
2173
2174
static inline unsigned getIComponentBitsIBC( int val )
2175
3.04M
{
2176
3.04M
  if( !val ) return 1;
2177
2178
18.4E
  const unsigned int l2 = floorLog2( (val <= 0) ? (-val << 1) + 1 : (val << 1) );
2179
2180
1.53M
  return (l2 << 1) + 1;
2181
3.04M
}
2182
2183
unsigned int RdCost::getBitsMultiplePredsIBC(int x, int y, bool useIMV)
2184
1.11M
{
2185
1.11M
  int rmvH[2];
2186
1.11M
  int rmvV[2];
2187
1.11M
  rmvH[0] = x - m_bvPredictors[0].hor;
2188
1.11M
  rmvH[1] = x - m_bvPredictors[1].hor;
2189
2190
1.11M
  rmvV[0] = y - m_bvPredictors[0].ver;
2191
1.11M
  rmvV[1] = y - m_bvPredictors[1].ver;
2192
1.11M
  int absCand[2];
2193
1.11M
  absCand[0] = abs(rmvH[0]) + abs(rmvV[0]);
2194
1.11M
  absCand[1] = abs(rmvH[1]) + abs(rmvV[1]);
2195
2196
1.11M
  if (useIMV && x % 4 == 0 && y % 4 == 0)
2197
408k
  {
2198
408k
    int rmvHQP[2];
2199
408k
    int rmvVQP[2];
2200
2201
408k
    int imvShift = 2;
2202
408k
    int offset = 1 << (imvShift - 1);
2203
2204
408k
    rmvHQP[0] = (x >> 2) - ((m_bvPredictors[0].hor + offset) >> 2);
2205
408k
    rmvHQP[1] = (x >> 2) - ((m_bvPredictors[1].hor + offset) >> 2);
2206
408k
    rmvVQP[0] = (y >> 2) - ((m_bvPredictors[0].ver + offset) >> 2);
2207
408k
    rmvVQP[1] = (y >> 2) - ((m_bvPredictors[1].ver + offset) >> 2);
2208
2209
408k
    int absCandQP[2];
2210
408k
    absCandQP[0] = abs(rmvHQP[0]) + abs(rmvVQP[0]);
2211
408k
    absCandQP[1] = abs(rmvHQP[1]) + abs(rmvVQP[1]);
2212
408k
    unsigned int candBits0QP, candBits1QP;
2213
408k
    if (absCand[0] < absCand[1])
2214
0
    {
2215
0
      unsigned int candBits0 = getIComponentBitsIBC(rmvH[0]) + getIComponentBitsIBC(rmvV[0]);
2216
0
      if (absCandQP[0] < absCandQP[1])
2217
0
      {
2218
0
        candBits0QP = getIComponentBitsIBC(rmvHQP[0]) + getIComponentBitsIBC(rmvVQP[0]);
2219
0
        return candBits0QP < candBits0 ? candBits0QP : candBits0;
2220
0
      }
2221
0
      else
2222
0
      {
2223
0
        candBits1QP = getIComponentBitsIBC(rmvHQP[1]) + getIComponentBitsIBC(rmvVQP[1]);
2224
0
        return candBits1QP < candBits0 ? candBits1QP : candBits0;
2225
0
      }
2226
0
    }
2227
408k
    else
2228
408k
    {
2229
408k
      unsigned int candBits1 = getIComponentBitsIBC(rmvH[1]) + getIComponentBitsIBC(rmvV[1]);
2230
408k
      if (absCandQP[0] < absCandQP[1])
2231
0
      {
2232
0
        candBits0QP = getIComponentBitsIBC(rmvHQP[0]) + getIComponentBitsIBC(rmvVQP[0]);
2233
0
        return candBits0QP < candBits1 ? candBits0QP : candBits1;
2234
0
      }
2235
408k
      else
2236
408k
      {
2237
408k
        candBits1QP = getIComponentBitsIBC(rmvHQP[1]) + getIComponentBitsIBC(rmvVQP[1]);
2238
18.4E
        return candBits1QP < candBits1 ? candBits1QP : candBits1;
2239
408k
      }
2240
408k
    }
2241
408k
  }
2242
707k
  else
2243
707k
  {
2244
707k
    if (absCand[0] < absCand[1])
2245
0
    {
2246
0
      return getIComponentBitsIBC(rmvH[0]) + getIComponentBitsIBC(rmvV[0]);
2247
0
    }
2248
707k
    else
2249
707k
    {
2250
707k
      return getIComponentBitsIBC(rmvH[1]) + getIComponentBitsIBC(rmvV[1]);
2251
707k
    }
2252
707k
  }
2253
1.11M
}
2254
2255
} // namespace vvenc
2256
2257
//! \}
2258