Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvenc/source/Lib/CommonLib/RdCost.cpp
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
44
/** \file     RdCost.cpp
45
    \brief    RD cost computation class
46
*/
47
48
#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP
49
50
#include "RdCost.h"
51
#include "Rom.h"
52
#include "UnitPartitioner.h"
53
#include "SearchSpaceCounter.h"
54
55
56
//! \ingroup CommonLib
57
//! \{
58
59
namespace vvenc {
60
61
62
template<int csx>
63
static Distortion lumaWeightedSSE_Core( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights );
64
65
static Distortion fixWeightedSSE_Core( const DistParam& rcDtParam, uint32_t fixedWeight );
66
67
RdCost::RdCost()
68
0
  : m_afpDistortFunc{ { nullptr, }, { nullptr, } }
69
0
{
70
0
}
71
72
// Nothing to release explicitly: the destructor body is empty.
RdCost::~RdCost() = default;
75
76
void RdCost::setLambda( double dLambda, const BitDepths &bitDepths )
77
0
{
78
0
  m_dLambda          = dLambda;
79
0
  m_DistScale        = double(1<<SCALE_BITS) / m_dLambda;
80
0
  m_dLambdaMotionSAD = sqrt(m_dLambda);
81
0
}
82
83
84
// Initialize Function Pointer by [eDFunc]
85
void RdCost::create( bool enableOpt )
86
0
{
87
0
  m_signalType                 = RESHAPE_SIGNAL_NULL;
88
0
  m_chromaWeight               = 1.0;
89
0
  m_lumaBD                     = 10;
90
0
  m_afpDistortFunc[0][DF_SSE    ] = RdCost::xGetSSE;
91
0
  m_afpDistortFunc[0][DF_SSE2   ] = RdCost::xGetSSE;
92
0
  m_afpDistortFunc[0][DF_SSE4   ] = RdCost::xGetSSE4;
93
0
  m_afpDistortFunc[0][DF_SSE8   ] = RdCost::xGetSSE8;
94
0
  m_afpDistortFunc[0][DF_SSE16  ] = RdCost::xGetSSE16;
95
0
  m_afpDistortFunc[0][DF_SSE32  ] = RdCost::xGetSSE32;
96
0
  m_afpDistortFunc[0][DF_SSE64  ] = RdCost::xGetSSE64;
97
0
  m_afpDistortFunc[0][DF_SSE128 ] = RdCost::xGetSSE128;
98
99
0
  m_afpDistortFunc[0][DF_SAD    ] = RdCost::xGetSAD;
100
0
  m_afpDistortFunc[0][DF_SAD2   ] = RdCost::xGetSAD;
101
0
  m_afpDistortFunc[0][DF_SAD4   ] = RdCost::xGetSAD4;
102
0
  m_afpDistortFunc[0][DF_SAD8   ] = RdCost::xGetSAD8;
103
0
  m_afpDistortFunc[0][DF_SAD16  ] = RdCost::xGetSAD16;
104
0
  m_afpDistortFunc[0][DF_SAD32  ] = RdCost::xGetSAD32;
105
0
  m_afpDistortFunc[0][DF_SAD64  ] = RdCost::xGetSAD64;
106
0
  m_afpDistortFunc[0][DF_SAD128 ] = RdCost::xGetSAD128;
107
108
0
  m_afpDistortFunc[0][DF_HAD    ] = RdCost::xGetHADs<false>;
109
0
  m_afpDistortFunc[0][DF_HAD2   ] = RdCost::xGetHADs<false>;
110
0
  m_afpDistortFunc[0][DF_HAD4   ] = RdCost::xGetHADs<false>;
111
0
  m_afpDistortFunc[0][DF_HAD8   ] = RdCost::xGetHADs<false>;
112
0
  m_afpDistortFunc[0][DF_HAD16  ] = RdCost::xGetHADs<false>;
113
0
  m_afpDistortFunc[0][DF_HAD32  ] = RdCost::xGetHADs<false>;
114
0
  m_afpDistortFunc[0][DF_HAD64  ] = RdCost::xGetHADs<false>;
115
0
  m_afpDistortFunc[0][DF_HAD128 ] = RdCost::xGetHADs<false>;
116
117
0
  m_afpDistortFunc[0][DF_HAD_fast    ] = RdCost::xGetHADs<true>;
118
0
  m_afpDistortFunc[0][DF_HAD2_fast   ] = RdCost::xGetHADs<true>;
119
0
  m_afpDistortFunc[0][DF_HAD4_fast   ] = RdCost::xGetHADs<true>;
120
0
  m_afpDistortFunc[0][DF_HAD8_fast   ] = RdCost::xGetHADs<true>;
121
0
  m_afpDistortFunc[0][DF_HAD16_fast  ] = RdCost::xGetHADs<true>;
122
0
  m_afpDistortFunc[0][DF_HAD32_fast  ] = RdCost::xGetHADs<true>;
123
0
  m_afpDistortFunc[0][DF_HAD64_fast  ] = RdCost::xGetHADs<true>;
124
0
  m_afpDistortFunc[0][DF_HAD128_fast ] = RdCost::xGetHADs<true>;
125
126
  //  m_afpDistortFunc[0][DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD;
127
0
  m_afpDistortFunc[0][DF_HAD_2SAD ] = RdCost::xGetHAD2SADs;
128
129
0
  m_afpDistortFunc[0][DF_SAD_WITH_MASK] = RdCost::xGetSADwMask;
130
  // m_afpDistortFunc[1] can be used in any case
131
0
  memcpy( m_afpDistortFunc[1], m_afpDistortFunc[0], sizeof(m_afpDistortFunc)/2);
132
133
0
  m_wtdPredPtr[0] = lumaWeightedSSE_Core<0>;
134
0
  m_wtdPredPtr[1] = lumaWeightedSSE_Core<1>;
135
0
  m_fxdWtdPredPtr = fixWeightedSSE_Core;
136
137
0
  m_afpDistortFuncX5[0] = RdCost::xGetSAD8X5;
138
0
  m_afpDistortFuncX5[1] = RdCost::xGetSAD16X5;
139
140
0
#if ENABLE_SIMD_OPT_DIST
141
0
  if( enableOpt )
142
0
  {
143
0
#ifdef TARGET_SIMD_X86
144
0
    initRdCostX86();
145
0
#endif
146
#ifdef TARGET_SIMD_ARM
147
    initRdCostARM();
148
#endif
149
0
  }
150
0
#endif
151
152
0
  m_costMode      = VVENC_COST_STANDARD_LOSSY;
153
0
  m_motionLambda  = 0;
154
0
  m_iCostScale    = 0;
155
0
}
156
157
#if ENABLE_MEASURE_SEARCH_SPACE
158
// Wrapper installed as DistParam::distFunc when search-space measurement is
// enabled: records the block in g_searchSpaceAcc, then forwards to the kernel
// saved in dp.xDistFunc and returns its result.
static Distortion xMeasurePredSearchSpaceInterceptor( const DistParam& dp )
{
  const auto& block = dp.cur;
  g_searchSpaceAcc.addPrediction( block.width, block.height, toChannelType( dp.compID ) );

  const Distortion result = dp.xDistFunc( dp );
  return result;
}
163
164
#endif
165
/** Fills an existing DistParam for comparing \p org against a reference block.
    \param rcDP          parameter set to fill; its applyWeight member is read, not written
    \param org           original block; also supplies width/height for the reference
    \param piRefY        reference samples
    \param iRefStride    stride of the reference samples
    \param bitDepth      bit depth stored in rcDP; values above 10 select table row 1
    \param compID        component stored in rcDP
    \param subShiftMode  1: subsample rows when height > 8 and width <= 128,
                         2: subsample rows when height > 8, otherwise no subsampling
    \param useHadamard   0: SAD, 1: DF_HAD family, any other value: DF_HAD_fast family
*/
void RdCost::setDistParam( DistParam &rcDP, const CPelBuf& org, const Pel* piRefY, int iRefStride, int bitDepth, ComponentID compID, int subShiftMode, int useHadamard )
{
  rcDP.bitDepth = bitDepth;
  rcDP.compID   = compID;

  // original block is taken over as is
  rcDP.org = org;

  // reference block: caller's pointer/stride, dimensions of the original
  rcDP.cur.buf    = piRefY;
  rcDP.cur.stride = iRefStride;
  rcDP.cur.width  = org.width;
  rcDP.cur.height = org.height;

  rcDP.maximumDistortionForEarlyExit = MAX_DISTORTION;

  // pick the table row, then the function family, then the width slot
  const bool useRow1 = rcDP.bitDepth > 10 || rcDP.applyWeight;
  const int  row     = useRow1 ? 1 : 0;

  int family = DF_SAD;
  if( useHadamard )
  {
    family = ( useHadamard == 1 ) ? DF_HAD : DF_HAD_fast;
  }
  rcDP.distFunc = m_afpDistortFunc[row][ family + Log2( org.width ) ];

  // row subsampling
  const bool taller8     = rcDP.org.height > 8;
  const bool subsampleM1 = subShiftMode == 1 && taller8 && rcDP.org.width <= 128;
  const bool subsampleM2 = subShiftMode == 2 && taller8;
  rcDP.subShift          = ( subsampleM1 || subsampleM2 ) ? 1 : 0;

#if ENABLE_MEASURE_SEARCH_SPACE
  // route the call through the measuring interceptor
  rcDP.xDistFunc = rcDP.distFunc;
  rcDP.distFunc  = xMeasurePredSearchSpaceInterceptor;
#endif
}
214
215
216
/** Builds a DistParam for two equally described buffers.
    \param org       original block
    \param cur       block to compare against
    \param bitDepth  bit depth; values above 10 select table row 1
    \param dfunc     distortion function; DF_HAD, DF_HAD_fast and DF_HAD_2SAD are
                     used unchanged, every other value is offset by Log2( org.width )
    \return DistParam constructed for component COMP_Y
*/
DistParam RdCost::setDistParam( const CPelBuf& org, const CPelBuf& cur, int bitDepth, DFunc dfunc )
{
  const bool fixedSlot   = dfunc == DF_HAD || dfunc == DF_HAD_fast || dfunc == DF_HAD_2SAD;
  const int  widthOffset = fixedSlot ? 0 : Log2( org.width );
  const int  slot        = dfunc + widthOffset;

  const int row = ( bitDepth > 10 ) ? 1 : 0; //TBD: check does SAD ever overflow

  DistParam rcDP( org, cur, m_afpDistortFunc[row][slot], bitDepth, 0, COMP_Y );
#if ENABLE_MEASURE_SEARCH_SPACE
  // route the call through the measuring interceptor
  rcDP.xDistFunc = rcDP.distFunc;
  rcDP.distFunc  = xMeasurePredSearchSpaceInterceptor;
#endif
  return rcDP;
}
234
235
/** Builds a SAD DistParam from raw sample pointers.
    \param pOrg        original samples
    \param piRefY      reference samples
    \param iOrgStride  stride of the original samples
    \param iRefStride  stride of the reference samples
    \param bitDepth    bit depth; values above 10 select table row 1
    \param compID      component stored in the result
    \param width       block width; selects the SAD slot via Log2( width )
    \param height      block height
    \param subShift    row subsampling shift, stored unchanged
    \param isDMVR      when true the X5 SAD kernel is set as well; width must then be 8 or 16
    \return the filled DistParam
*/
DistParam RdCost::setDistParam( const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShift, bool isDMVR )
{
  DistParam rcDP;
  rcDP.bitDepth   = bitDepth;
  rcDP.compID     = compID;

  rcDP.org.buf    = pOrg;
  rcDP.org.stride = iOrgStride;
  rcDP.org.width  = width;
  rcDP.org.height = height;

  rcDP.cur.buf    = piRefY;
  rcDP.cur.stride = iRefStride;
  rcDP.cur.width  = width;
  rcDP.cur.height = height;
  rcDP.subShift   = subShift;

  //  CHECK( useHadamard || rcDP.useMR, "only used in xDMVRCost with these default parameters (so far...)" );
  const int base = (rcDP.bitDepth > 10) ? 1 : 0;

  rcDP.distFunc = m_afpDistortFunc[base][ DF_SAD + Log2( width ) ];

  if( isDMVR )
  {
    // create() only installs X5 kernels in slot 0 (8-wide) and slot 1 (16-wide);
    // any other width would index outside of what has been populated.
    CHECKD( width != 8 && width != 16, "DMVR SAD-X5 kernels exist only for block widths 8 and 16" );
    rcDP.dmvrSadX5 = m_afpDistortFuncX5[Log2( width ) - 3];
  }

#if ENABLE_MEASURE_SEARCH_SPACE
  if( !isDMVR )
  {
    // DMVR is part of the decoder complexity
    rcDP.xDistFunc = rcDP.distFunc;
    rcDP.distFunc = xMeasurePredSearchSpaceInterceptor;
  }

#endif
  return rcDP;
}
273
274
/** Computes the distortion between \p org and \p cur.
    \param org       original block
    \param cur       block to compare against
    \param bitDepth  bit depth; values above 10 select table row 1
    \param compId    component; for chroma the result is scaled by m_distortionWeight[compId]
    \param eDFunc    distortion function family (offset by Log2( org.width ) when dispatched)
    \param orgLuma   when non-null, the weighted SSE path (xGetSSE_WTD) is taken instead
    \return distortion, chroma-weighted where applicable
*/
Distortion RdCost::getDistPart( const CPelBuf& org, const CPelBuf& cur, int bitDepth, const ComponentID compId, DFunc eDFunc, const CPelBuf* orgLuma )
{
  DistParam dp( org, cur, nullptr, bitDepth, 0, compId );
#if ENABLE_MEASURE_SEARCH_SPACE
  g_searchSpaceAcc.addPrediction( dp.cur.width, dp.cur.height, toChannelType( dp.compID ) );
#endif

  Distortion dist;
  if( orgLuma != nullptr )
  {
    // luma-weighted SSE
    CHECKD( eDFunc != DF_SSE_WTD, "mismatch func and parameter");
    dp.orgLuma = orgLuma;
    dist       = RdCost::xGetSSE_WTD( dp );
  }
  else if( org.width == 1 )
  {
    // one-sample-wide blocks always go through the generic SSE
    dist = xGetSSE( dp );
  }
  else
  {
    const int row  = ( bitDepth > 10 ) ? 1 : 0;
    const int slot = eDFunc + Log2( org.width );
    dist           = m_afpDistortFunc[row][slot]( dp );
  }

  if( !isChroma( compId ) )
  {
    return dist;
  }
  return static_cast<Distortion>( m_distortionWeight[ compId ] * dist );
}
308
309
// ====================================================================================================================
310
// Distortion functions
311
// ====================================================================================================================
312
313
// --------------------------------------------------------------------------------------------------------------------
314
// SAD
315
// --------------------------------------------------------------------------------------------------------------------
316
317
/** Generic-width sum of absolute differences with early termination.
    Rows are visited in steps of ( 1 << subShift ). After each visited row the
    running sum (shifted by the precision adjustment) is compared against
    maximumDistortionForEarlyExit and returned right away if it is larger; that
    early return does not apply the subsampling compensation shift.
    Throws if applyWeight is set.
*/
Distortion RdCost::xGetSAD( const DistParam& rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  const int      width     = rcDtParam.org.width;
  const int      shift     = rcDtParam.subShift;
  const int      rowStep   = 1 << shift;
  const int      orgStep   = rcDtParam.org.stride * rowStep;
  const int      curStep   = rcDtParam.cur.stride * rowStep;
  const uint32_t precShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < width; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }

    // bail out as soon as the limit is exceeded
    if( rcDtParam.maximumDistortionForEarlyExit < ( sad >> precShift ) )
    {
      return sad >> precShift;
    }

    orgRow += orgStep;
    curRow += curStep;
  }

  // compensate for the skipped rows
  sad <<= shift;
  return sad >> precShift;
}
353
354
/** Sum of absolute differences over the first 4 samples of each visited row.
    Rows are visited in steps of ( 1 << subShift ); the sum is shifted left by
    subShift afterwards and then right by the precision adjustment.
    Throws if applyWeight is set.
*/
Distortion RdCost::xGetSAD4( const DistParam& rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  constexpr int blkWidth = 4;

  const int shift   = rcDtParam.subShift;
  const int rowStep = 1 << shift;
  const int curStep = rcDtParam.cur.stride * rowStep;
  const int orgStep = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  // compensate for the skipped rows
  sad <<= shift;
  return sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}
385
386
/** Sum of absolute differences over the first 8 samples of each visited row.
    Rows are visited in steps of ( 1 << subShift ); the sum is shifted left by
    subShift afterwards and then right by the precision adjustment.
    Throws if applyWeight is set.
*/
Distortion RdCost::xGetSAD8( const DistParam& rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  constexpr int blkWidth = 8;

  const int shift   = rcDtParam.subShift;
  const int rowStep = 1 << shift;
  const int curStep = rcDtParam.cur.stride * rowStep;
  const int orgStep = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  // compensate for the skipped rows
  sad <<= shift;
  return sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}
421
422
/** Sum of absolute differences over the first 16 samples of each visited row.
    Rows are visited in steps of ( 1 << subShift ); the sum is shifted left by
    subShift afterwards and then right by the precision adjustment.
    Throws if applyWeight is set.
*/
Distortion RdCost::xGetSAD16( const DistParam& rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  constexpr int blkWidth = 16;

  const int shift   = rcDtParam.subShift;
  const int rowStep = 1 << shift;
  const int curStep = rcDtParam.cur.stride * rowStep;
  const int orgStep = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  // compensate for the skipped rows
  sad <<= shift;
  return sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}
465
466
467
/** Sum of absolute differences processed in groups of 16 samples per row.
    Each visited row is covered in chunks of 16 samples starting at offsets
    0, 16, 32, ... below org.width, so a full chunk is always read.
    Rows are visited in steps of ( 1 << subShift ); the sum is shifted left by
    subShift afterwards and then right by the precision adjustment.
    NOTE(review): unlike the other SAD kernels in this file, this one does not
    throw when applyWeight is set - confirm whether that is intended.
*/
Distortion RdCost::xGetSAD128( const DistParam &rcDtParam )
{
  constexpr int chunk = 16;

  const int width   = rcDtParam.org.width;
  const int shift   = rcDtParam.subShift;
  const int rowStep = 1 << shift;
  const int curStep = rcDtParam.cur.stride * rowStep;
  const int orgStep = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x0 = 0; x0 < width; x0 += chunk )
    {
      for( int k = 0; k < chunk; k++ )
      {
        sad += abs( orgRow[x0 + k] - curRow[x0 + k] );
      }
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  // compensate for the skipped rows
  sad <<= shift;
  return sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}
508
509
/** Sum of absolute differences over the first 32 samples of each visited row.
    Rows are visited in steps of ( 1 << subShift ); the sum is shifted left by
    subShift afterwards and then right by the precision adjustment.
    Throws if applyWeight is set.
*/
Distortion RdCost::xGetSAD32( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  constexpr int blkWidth = 32;

  const int shift   = rcDtParam.subShift;
  const int rowStep = 1 << shift;
  const int curStep = rcDtParam.cur.stride * rowStep;
  const int orgStep = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  // compensate for the skipped rows
  sad <<= shift;
  return sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}
568
569
570
/** Sum of absolute differences over the first 64 samples of each visited row.
    Rows are visited in steps of ( 1 << subShift ); the sum is shifted left by
    subShift afterwards and then right by the precision adjustment.
    Throws if applyWeight is set.
*/
Distortion RdCost::xGetSAD64( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  constexpr int blkWidth = 64;

  const int shift   = rcDtParam.subShift;
  const int rowStep = 1 << shift;
  const int curStep = rcDtParam.cur.stride * rowStep;
  const int orgStep = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  // compensate for the skipped rows
  sad <<= shift;
  return sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}
661
662
663
// --------------------------------------------------------------------------------------------------------------------
664
// SSE
665
// --------------------------------------------------------------------------------------------------------------------
666
667
/// Sum of squared errors between rcDtParam.org and rcDtParam.cur for an arbitrary width.
///
/// Walks org.height rows of org.width samples. Each squared sample difference is
/// right-shifted by 2 * DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before it is accumulated.
/// Weighted distortion (applyWeight) is not supported and is rejected via THROW.
Distortion RdCost::xGetSSE( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  const int      numCols   = rcDtParam.org.width;
  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t sqrShift  = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  // Count rows down to zero; both row pointers step by their own stride after every row.
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft, orgRow += orgStride, curRow += curStride )
  {
    for( int x = 0; x < numCols; ++x )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> sqrShift );
    }
  }

  return sse;
}
699
700
/// Sum of squared errors for blocks that are exactly 4 samples wide.
///
/// Reads 4 samples per row over org.height rows. Each squared difference is right-shifted
/// by 2 * DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before accumulation.
/// The weighted path (applyWeight) is not supported: it checks the width and then THROWs.
Distortion RdCost::xGetSSE4( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 4, "Invalid size" );
    THROW(" no support");
  }

  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t sqrShift  = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft, orgRow += orgStride, curRow += curStride )
  {
    // Fixed column count: the width is implied by the function, not read from rcDtParam.
    for( int x = 0; x < 4; ++x )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> sqrShift );
    }
  }

  return sse;
}
733
734
/// Sum of squared errors for blocks that are exactly 8 samples wide.
///
/// Reads 8 samples per row over org.height rows. Each squared difference is right-shifted
/// by 2 * DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before accumulation.
/// The weighted path (applyWeight) is not supported: it checks the width and then THROWs.
Distortion RdCost::xGetSSE8( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 8, "Invalid size" );
    THROW(" no support");
  }

  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t sqrShift  = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft, orgRow += orgStride, curRow += curStride )
  {
    // Fixed column count: the width is implied by the function, not read from rcDtParam.
    for( int x = 0; x < 8; ++x )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> sqrShift );
    }
  }

  return sse;
}
770
771
/// Sum of squared errors for blocks that are exactly 16 samples wide.
///
/// Reads 16 samples per row over org.height rows. Each squared difference is right-shifted
/// by 2 * DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before accumulation.
/// The weighted path (applyWeight) is not supported: it checks the width and then THROWs.
Distortion RdCost::xGetSSE16( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 16, "Invalid size" );
    THROW(" no support");
  }

  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t sqrShift  = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft, orgRow += orgStride, curRow += curStride )
  {
    // Fixed column count: the width is implied by the function, not read from rcDtParam.
    for( int x = 0; x < 16; ++x )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> sqrShift );
    }
  }

  return sse;
}
816
817
/// Sum of squared errors for wide blocks, consuming each row in groups of 16 samples.
///
/// Columns are visited 16 at a time while the group start is below org.width, so a width
/// that is not a multiple of 16 is effectively rounded up to the next multiple.
/// Each squared difference is right-shifted by 2 * DISTORTION_PRECISION_ADJUSTMENT(bitDepth)
/// before accumulation. Weighted distortion (applyWeight) is rejected via THROW.
Distortion RdCost::xGetSSE128( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  const int      numCols   = rcDtParam.org.width;
  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t sqrShift  = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft, orgRow += orgStride, curRow += curStride )
  {
    for( int groupStart = 0; groupStart < numCols; groupStart += 16 )
    {
      // A started group is always processed completely (16 samples).
      for( int k = 0; k < 16; ++k )
      {
        const Intermediate_Int delta = orgRow[groupStart + k] - curRow[groupStart + k];
        sse += Distortion( ( delta * delta ) >> sqrShift );
      }
    }
  }

  return sse;
}
864
865
/// Sum of squared errors for blocks that are 32 samples wide.
///
/// Reads 32 samples per row over org.height rows. Each squared difference is right-shifted
/// by 2 * DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before accumulation.
/// Weighted distortion (applyWeight) is not supported and is rejected via THROW.
Distortion RdCost::xGetSSE32( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t sqrShift  = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft, orgRow += orgStride, curRow += curStride )
  {
    // Fixed column count: the width is implied by the function, not read from rcDtParam.
    for( int x = 0; x < 32; ++x )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> sqrShift );
    }
  }

  return sse;
}
925
926
/// Sum of squared errors for blocks that are 64 samples wide.
///
/// Reads 64 samples per row over org.height rows. Each squared difference is right-shifted
/// by 2 * DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before accumulation.
/// Weighted distortion (applyWeight) is not supported and is rejected via THROW.
Distortion RdCost::xGetSSE64( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t sqrShift  = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft, orgRow += orgStride, curRow += curStride )
  {
    // Fixed column count: the width is implied by the function, not read from rcDtParam.
    for( int x = 0; x < 64; ++x )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> sqrShift );
    }
  }

  return sse;
}
1017
1018
// --------------------------------------------------------------------------------------------------------------------
1019
// HADAMARD with step (used in fractional search)
1020
// --------------------------------------------------------------------------------------------------------------------
1021
1022
/// 2x2 Hadamard-transformed absolute difference of one block.
///
/// \param piOrg       top-left sample of the original 2x2 block
/// \param piCur       top-left sample of the candidate 2x2 block
/// \param iStrideOrg  row stride of piOrg
/// \param iStrideCur  row stride of piCur
/// \return sum of the absolute transform coefficients, with the first (sum-of-all)
///         coefficient scaled down by 4
Distortion RdCost::xCalcHADs2x2( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  // Residual of the four samples.
  const TCoeff resTopLeft  = piOrg[0]              - piCur[0];
  const TCoeff resTopRight = piOrg[1]              - piCur[1];
  const TCoeff resBotLeft  = piOrg[iStrideOrg]     - piCur[iStrideCur];
  const TCoeff resBotRight = piOrg[iStrideOrg + 1] - piCur[iStrideCur + 1];

  // Vertical butterfly: column sums and column differences.
  const TCoeff sumLeft   = resTopLeft  + resBotLeft;
  const TCoeff sumRight  = resTopRight + resBotRight;
  const TCoeff diffLeft  = resTopLeft  - resBotLeft;
  const TCoeff diffRight = resTopRight - resBotRight;

  // Horizontal butterfly folded into the accumulation; only the first term is down-weighted.
  Distortion satd = abs( sumLeft + sumRight ) >> 2;
  satd += abs( sumLeft  - sumRight );
  satd += abs( diffLeft + diffRight );
  satd += abs( diffLeft - diffRight );

  return satd;
}
1043
1044
/// 4x4 Hadamard-transformed absolute difference of one block.
///
/// \param piOrg       top-left sample of the original 4x4 block
/// \param piCur       top-left sample of the candidate 4x4 block
/// \param iStrideOrg  row stride of piOrg
/// \param iStrideCur  row stride of piCur
/// \return ( sum of absolute coefficients, first coefficient scaled down by 4, plus 1 ) >> 1
static Distortion xCalcHADs4x4( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  TCoeff resi[16], bufA[16], bufB[16];

  // Residual, stored row-major.
  for( int row = 0; row < 4; ++row )
  {
    for( int col = 0; col < 4; ++col )
    {
      resi[4 * row + col] = piOrg[col] - piCur[col];
    }
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  /*===== hadamard transform =====*/
  // Two butterfly stages across rows, evaluated independently for each column.
  for( int col = 0; col < 4; ++col )
  {
    bufA[     col] = resi[    col] + resi[12 + col];
    bufA[ 4 + col] = resi[4 + col] + resi[ 8 + col];
    bufA[ 8 + col] = resi[4 + col] - resi[ 8 + col];
    bufA[12 + col] = resi[    col] - resi[12 + col];

    bufB[     col] = bufA[     col] + bufA[ 4 + col];
    bufB[ 4 + col] = bufA[ 8 + col] + bufA[12 + col];
    bufB[ 8 + col] = bufA[     col] - bufA[ 4 + col];
    bufB[12 + col] = bufA[12 + col] - bufA[ 8 + col];
  }

  // Two butterfly stages inside each group of four consecutive coefficients.
  for( int base = 0; base < 16; base += 4 )
  {
    bufA[base + 0] = bufB[base + 0] + bufB[base + 3];
    bufA[base + 1] = bufB[base + 1] + bufB[base + 2];
    bufA[base + 2] = bufB[base + 1] - bufB[base + 2];
    bufA[base + 3] = bufB[base + 0] - bufB[base + 3];

    bufB[base + 0] = bufA[base + 0] + bufA[base + 1];
    bufB[base + 1] = bufA[base + 0] - bufA[base + 1];
    bufB[base + 2] = bufA[base + 2] + bufA[base + 3];
    bufB[base + 3] = bufA[base + 3] - bufA[base + 2];
  }

  // The first coefficient contributes only a quarter of its magnitude; the rest count fully.
  Distortion satd = abs( bufB[0] ) >> 2;
  for( int k = 1; k < 16; ++k )
  {
    satd += abs( bufB[k] );
  }

  return ( satd + 1 ) >> 1;
}
1141
1142
/// Fast Hadamard cost of a 16x16 block, computed on a 2x2-averaged 8x8 residual.
///
/// Every 2x2 group of samples is replaced by its rounded mean, separately for original and
/// candidate. The 8x8 difference of those means goes through an 8x8 Hadamard transform.
///
/// \param piOrg       top-left sample of the original 16x16 block
/// \param piCur       top-left sample of the candidate 16x16 block
/// \param iStrideOrg  row stride of piOrg
/// \param iStrideCur  row stride of piCur
/// \return the 8x8 cost (first coefficient scaled down by 4, rounded >> 2), multiplied by 4
static Distortion xCalcHADs16x16_fast( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  TCoeff coef[64];

  // Down-sampled residual: one entry per 2x2 group, stored row-major.
  for( int row = 0; row < 8; ++row )
  {
    for( int col = 0; col < 8; ++col )
    {
      const int x = 2 * col;
      coef[8 * row + col] = ( ( piOrg[x] + piOrg[x + 1] + piOrg[x + iStrideOrg] + piOrg[x + 1 + iStrideOrg] + 2 ) >> 2 )
                          - ( ( piCur[x] + piCur[x + 1] + piCur[x + iStrideCur] + piCur[x + 1 + iStrideCur] + 2 ) >> 2 );
    }
    piCur += 2 * iStrideCur;
    piOrg += 2 * iStrideOrg;
  }

  // In-place 8-point butterfly over v[0], v[step], ..., v[7*step], using spans 4, 2, 1.
  auto butterfly8 = []( TCoeff* v, int step )
  {
    for( int span = 4; span >= 1; span >>= 1 )
    {
      for( int base = 0; base < 8; base += 2 * span )
      {
        for( int i = base; i < base + span; ++i )
        {
          const TCoeff lo = v[i * step];
          const TCoeff hi = v[( i + span ) * step];
          v[i * step]            = lo + hi;
          v[( i + span ) * step] = lo - hi;
        }
      }
    }
  };

  //horizontal
  for( int row = 0; row < 8; ++row )
  {
    butterfly8( coef + 8 * row, 1 );
  }

  //vertical
  for( int col = 0; col < 8; ++col )
  {
    butterfly8( coef + col, 8 );
  }

  // The first coefficient contributes only a quarter of its magnitude; the rest count fully.
  Distortion sad = abs( coef[0] ) >> 2;
  for( int k = 1; k < 64; ++k )
  {
    sad += abs( coef[k] );
  }

  sad = ( sad + 2 ) >> 2;

  return ( sad << 2 );
}
1240
1241
/// 8x8 Hadamard-transformed absolute difference of one block.
///
/// \param piOrg       top-left sample of the original 8x8 block
/// \param piCur       top-left sample of the candidate 8x8 block
/// \param iStrideOrg  row stride of piOrg
/// \param iStrideCur  row stride of piCur
/// \return ( sum of absolute coefficients, first coefficient scaled down by 4, plus 2 ) >> 2
static Distortion xCalcHADs8x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  TCoeff coef[64];

  // Residual, stored row-major.
  for( int row = 0; row < 8; ++row )
  {
    for( int col = 0; col < 8; ++col )
    {
      coef[8 * row + col] = piOrg[col] - piCur[col];
    }
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  // In-place 8-point butterfly over v[0], v[step], ..., v[7*step], using spans 4, 2, 1.
  auto butterfly8 = []( TCoeff* v, int step )
  {
    for( int span = 4; span >= 1; span >>= 1 )
    {
      for( int base = 0; base < 8; base += 2 * span )
      {
        for( int i = base; i < base + span; ++i )
        {
          const TCoeff lo = v[i * step];
          const TCoeff hi = v[( i + span ) * step];
          v[i * step]            = lo + hi;
          v[( i + span ) * step] = lo - hi;
        }
      }
    }
  };

  //horizontal
  for( int row = 0; row < 8; ++row )
  {
    butterfly8( coef + 8 * row, 1 );
  }

  //vertical
  for( int col = 0; col < 8; ++col )
  {
    butterfly8( coef + col, 8 );
  }

  // The first coefficient contributes only a quarter of its magnitude; the rest count fully.
  Distortion sad = abs( coef[0] ) >> 2;
  for( int k = 1; k < 64; ++k )
  {
    sad += abs( coef[k] );
  }

  sad = ( sad + 2 ) >> 2;

  return sad;
}
1339
1340
// Hadamard-style cost of a 16 (wide) x 8 (high) block: the sample differences
// org - cur are run through add/subtract butterflies along rows and then along
// columns, the absolute values of all resulting coefficients are summed (the
// [0][0] coefficient only contributes a quarter of its magnitude), and the sum
// is scaled by 2 / sqrt(16 * 8).
//
// piOrg / piCur       first sample of the original / current block
// iStrideOrg / ...Cur distance in samples between consecutive rows
static Distortion xCalcHADs16x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  // TODO (original note, JCA): a SIMD implementation still needs to be added.
  const int blkW = 16;
  const int blkH = 8;
  int coef[blkH][blkW];

  // residual: one row of differences per iteration
  for( int row = 0; row < blkH; row++ )
  {
    for( int col = 0; col < blkW; col++ )
    {
      coef[row][col] = piOrg[col] - piCur[col];
    }
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  // horizontal: pair distance 8, 4, 2, 1 inside every row
  for( int row = 0; row < blkH; row++ )
  {
    for( int half = blkW >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < blkW; base += ( half << 1 ) )
      {
        for( int k = base; k < base + half; k++ )
        {
          const int a = coef[row][k];
          const int b = coef[row][k + half];
          coef[row][k]        = a + b;
          coef[row][k + half] = a - b;
        }
      }
    }
  }

  // vertical: pair distance 4, 2, 1 inside every column
  for( int col = 0; col < blkW; col++ )
  {
    for( int half = blkH >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < blkH; base += ( half << 1 ) )
      {
        for( int k = base; k < base + half; k++ )
        {
          const int a = coef[k][col];
          const int b = coef[k + half][col];
          coef[k][col]        = a + b;
          coef[k + half][col] = a - b;
        }
      }
    }
  }

  int sad = 0;
  for( int row = 0; row < blkH; row++ )
  {
    for( int col = 0; col < blkW; col++ )
    {
      sad += abs( coef[row][col] );
    }
  }

  // the [0][0] coefficient is counted with a quarter of its magnitude only
  const int dcMag = abs( coef[0][0] );
  sad -= dcMag;
  sad += dcMag >> 2;
  sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );

  return sad;
}
1487
1488
// Hadamard-style cost of an 8 (wide) x 16 (high) block: the sample differences
// org - cur are run through add/subtract butterflies along rows and then along
// columns, the absolute values of all resulting coefficients are summed (the
// [0][0] coefficient only contributes a quarter of its magnitude), and the sum
// is scaled by 2 / sqrt(16 * 8).
//
// piOrg / piCur       first sample of the original / current block
// iStrideOrg / ...Cur distance in samples between consecutive rows
static Distortion xCalcHADs8x16( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  const int blkW = 8;
  const int blkH = 16;
  int coef[blkH][blkW];

  // residual: one row of differences per iteration
  for( int row = 0; row < blkH; row++ )
  {
    for( int col = 0; col < blkW; col++ )
    {
      coef[row][col] = piOrg[col] - piCur[col];
    }
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  // horizontal: pair distance 4, 2, 1 inside every row
  for( int row = 0; row < blkH; row++ )
  {
    for( int half = blkW >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < blkW; base += ( half << 1 ) )
      {
        for( int k = base; k < base + half; k++ )
        {
          const int a = coef[row][k];
          const int b = coef[row][k + half];
          coef[row][k]        = a + b;
          coef[row][k + half] = a - b;
        }
      }
    }
  }

  // vertical: pair distance 8, 4, 2, 1 inside every column
  for( int col = 0; col < blkW; col++ )
  {
    for( int half = blkH >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < blkH; base += ( half << 1 ) )
      {
        for( int k = base; k < base + half; k++ )
        {
          const int a = coef[k][col];
          const int b = coef[k + half][col];
          coef[k][col]        = a + b;
          coef[k + half][col] = a - b;
        }
      }
    }
  }

  int sad = 0;
  for( int row = 0; row < blkH; row++ )
  {
    for( int col = 0; col < blkW; col++ )
    {
      sad += abs( coef[row][col] );
    }
  }

  // the [0][0] coefficient is counted with a quarter of its magnitude only
  const int dcMag = abs( coef[0][0] );
  sad -= dcMag;
  sad += dcMag >> 2;
  sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );

  return sad;
}
1626
1627
// Hadamard-style cost of a 4 (wide) x 8 (high) block: the sample differences
// org - cur are run through add/subtract butterflies along rows and then along
// columns, the absolute values of all resulting coefficients are summed (the
// [0][0] coefficient only contributes a quarter of its magnitude), and the sum
// is scaled by 2 / sqrt(4 * 8).
//
// piOrg / piCur       first sample of the original / current block
// iStrideOrg / ...Cur distance in samples between consecutive rows
static Distortion xCalcHADs4x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  const int blkW = 4;
  const int blkH = 8;
  int coef[blkH][blkW];

  // residual: one row of differences per iteration
  for( int row = 0; row < blkH; row++ )
  {
    for( int col = 0; col < blkW; col++ )
    {
      coef[row][col] = piOrg[col] - piCur[col];
    }
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  // horizontal: pair distance 2, 1 inside every row
  for( int row = 0; row < blkH; row++ )
  {
    for( int half = blkW >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < blkW; base += ( half << 1 ) )
      {
        for( int k = base; k < base + half; k++ )
        {
          const int a = coef[row][k];
          const int b = coef[row][k + half];
          coef[row][k]        = a + b;
          coef[row][k + half] = a - b;
        }
      }
    }
  }

  // vertical: pair distance 4, 2, 1 inside every column
  for( int col = 0; col < blkW; col++ )
  {
    for( int half = blkH >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < blkH; base += ( half << 1 ) )
      {
        for( int k = base; k < base + half; k++ )
        {
          const int a = coef[k][col];
          const int b = coef[k + half][col];
          coef[k][col]        = a + b;
          coef[k + half][col] = a - b;
        }
      }
    }
  }

  int sad = 0;
  for( int row = 0; row < blkH; row++ )
  {
    for( int col = 0; col < blkW; col++ )
    {
      sad += abs( coef[row][col] );
    }
  }

  // the [0][0] coefficient is counted with a quarter of its magnitude only
  const int dcMag = abs( coef[0][0] );
  sad -= dcMag;
  sad += dcMag >> 2;
  sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );

  return sad;
}
1702
1703
// Hadamard-style cost of an 8 (wide) x 4 (high) block: the sample differences
// org - cur are run through add/subtract butterflies along rows and then along
// columns, the absolute values of all resulting coefficients are summed (the
// [0][0] coefficient only contributes a quarter of its magnitude), and the sum
// is scaled by 2 / sqrt(4 * 8).
//
// piOrg / piCur       first sample of the original / current block
// iStrideOrg / ...Cur distance in samples between consecutive rows
static Distortion xCalcHADs8x4( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  const int blkW = 8;
  const int blkH = 4;
  int coef[blkH][blkW];

  // residual: one row of differences per iteration
  for( int row = 0; row < blkH; row++ )
  {
    for( int col = 0; col < blkW; col++ )
    {
      coef[row][col] = piOrg[col] - piCur[col];
    }
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  // horizontal: pair distance 4, 2, 1 inside every row
  for( int row = 0; row < blkH; row++ )
  {
    for( int half = blkW >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < blkW; base += ( half << 1 ) )
      {
        for( int k = base; k < base + half; k++ )
        {
          const int a = coef[row][k];
          const int b = coef[row][k + half];
          coef[row][k]        = a + b;
          coef[row][k + half] = a - b;
        }
      }
    }
  }

  // vertical: pair distance 2, 1 inside every column
  for( int col = 0; col < blkW; col++ )
  {
    for( int half = blkH >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < blkH; base += ( half << 1 ) )
      {
        for( int k = base; k < base + half; k++ )
        {
          const int a = coef[k][col];
          const int b = coef[k + half][col];
          coef[k][col]        = a + b;
          coef[k + half][col] = a - b;
        }
      }
    }
  }

  int sad = 0;
  for( int row = 0; row < blkH; row++ )
  {
    for( int col = 0; col < blkW; col++ )
    {
      sad += abs( coef[row][col] );
    }
  }

  // the [0][0] coefficient is counted with a quarter of its magnitude only
  const int dcMag = abs( coef[0][0] );
  sad -= dcMag;
  sad += dcMag >> 2;
  sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );

  return sad;
}
1783
1784
// Returns the smaller of the Hadamard cost (xGetHADs<false>) and twice the
// plain sum of absolute differences of the block. Weighted prediction
// (applyWeight) is not supported and raises an error.
Distortion RdCost::xGetHAD2SADs( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  const Distortion hadCost = xGetHADs<false>( rcDtParam );

  CHECKD( (rcDtParam.org.width != rcDtParam.org.stride) || (rcDtParam.cur.stride != rcDtParam.org.stride) , "this functions assumes compact, aligned buffering");

  // With stride == width the block is contiguous, so it is walked as
  // (height / 4) lines of (width * 4) samples, 16 samples per step.
  const Pel* orgLine  = rcDtParam.org.buf;
  const Pel* curLine  = rcDtParam.cur.buf;
  const int  numLines = rcDtParam.org.height>>2;
  const int  lineLen  = rcDtParam.org.width<<2;

  Distortion absSum = 0;

  for( int line = 0; line < numLines; line++, orgLine += lineLen, curLine += lineLen )
  {
    for( int base = 0; base < lineLen; base += 16 )
    {
      for( int k = base; k < base + 16; k++ )
      {
        absSum += abs( orgLine[k] - curLine[k] );
      }
    }
  }

  const Distortion sadCost = (absSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));

  return std::min( hadCost, 2*sadCost);
}
1833
1834
template<bool fastHad>
1835
Distortion RdCost::xGetHADs( const DistParam &rcDtParam )
1836
0
{
1837
0
  if( rcDtParam.applyWeight )
1838
0
  {
1839
0
    THROW(" no support");
1840
0
  }
1841
0
  const Pel* piOrg = rcDtParam.org.buf;
1842
0
  const Pel* piCur = rcDtParam.cur.buf;
1843
0
  const int  iRows = rcDtParam.org.height;
1844
0
  const int  iCols = rcDtParam.org.width;
1845
0
  const int  iStrideCur = rcDtParam.cur.stride;
1846
0
  const int  iStrideOrg = rcDtParam.org.stride;
1847
1848
0
  int  x = 0, y = 0;
1849
1850
0
  Distortion uiSum = 0;
1851
1852
0
  if( iCols > iRows && ( iRows & 7 ) == 0 && ( iCols & 15 ) == 0 )
1853
0
  {
1854
0
    for( y = 0; y < iRows; y += 8 )
1855
0
    {
1856
0
      for( x = 0; x < iCols; x += 16 )
1857
0
      {
1858
0
        uiSum += xCalcHADs16x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1859
0
      }
1860
0
      piOrg += iStrideOrg * 8;
1861
0
      piCur += iStrideCur * 8;
1862
0
    }
1863
0
  }
1864
0
  else if( iCols < iRows && ( iCols & 7 ) == 0 && ( iRows & 15 ) == 0 )
1865
0
  {
1866
0
    for( y = 0; y < iRows; y += 16 )
1867
0
    {
1868
0
      for( x = 0; x < iCols; x += 8 )
1869
0
      {
1870
0
        uiSum += xCalcHADs8x16( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1871
0
      }
1872
0
      piOrg += iStrideOrg * 16;
1873
0
      piCur += iStrideCur * 16;
1874
0
    }
1875
0
  }
1876
0
  else if( iCols > iRows && ( iRows & 3 ) == 0 && ( iCols & 7 ) == 0 )
1877
0
  {
1878
0
    for( y = 0; y < iRows; y += 4 )
1879
0
    {
1880
0
      for( x = 0; x < iCols; x += 8 )
1881
0
      {
1882
0
        uiSum += xCalcHADs8x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1883
0
      }
1884
0
      piOrg += iStrideOrg * 4;
1885
0
      piCur += iStrideCur * 4;
1886
0
    }
1887
0
  }
1888
0
  else if( iCols < iRows && ( iCols & 3 ) == 0 && ( iRows & 7 ) == 0 )
1889
0
  {
1890
0
    for( y = 0; y < iRows; y += 8 )
1891
0
    {
1892
0
      for( x = 0; x < iCols; x += 4 )
1893
0
      {
1894
0
        uiSum += xCalcHADs4x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1895
0
      }
1896
0
      piOrg += iStrideOrg * 8;
1897
0
      piCur += iStrideCur * 8;
1898
0
    }
1899
0
  }
1900
0
  else if( fastHad && ( ( iRows % 32 == 0 ) && ( iCols % 32 == 0 ) ) && iRows == iCols )
1901
0
  {
1902
0
    for( y = 0; y < iRows; y += 16 )
1903
0
    {
1904
0
      for( x = 0; x < iCols; x += 16 )
1905
0
      {
1906
0
        uiSum += xCalcHADs16x16_fast( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1907
0
      }
1908
0
      piOrg += 16 * iStrideOrg;
1909
0
      piCur += 16 * iStrideCur;
1910
0
    }
1911
0
  }
1912
0
  else if( ( iRows % 8 == 0 ) && ( iCols % 8 == 0 ) )
1913
0
  {
1914
0
    for( y = 0; y < iRows; y += 8 )
1915
0
    {
1916
0
      for( x = 0; x < iCols; x += 8 )
1917
0
      {
1918
0
        uiSum += xCalcHADs8x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1919
0
      }
1920
0
      piOrg += 8*iStrideOrg;
1921
0
      piCur += 8*iStrideCur;
1922
0
    }
1923
0
  }
1924
0
  else if( ( iRows % 4 == 0 ) && ( iCols % 4 == 0 ) )
1925
0
  {
1926
0
    for( y = 0; y < iRows; y += 4 )
1927
0
    {
1928
0
      for( x = 0; x < iCols; x += 4 )
1929
0
      {
1930
0
        uiSum += xCalcHADs4x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1931
0
      }
1932
0
      piOrg += 4*iStrideOrg;
1933
0
      piCur += 4*iStrideCur;
1934
0
    }
1935
0
  }
1936
0
  else if( ( iRows % 2 == 0 ) && ( iCols % 2 == 0 ) )
1937
0
  {
1938
0
    for( y = 0; y < iRows; y += 2 )
1939
0
    {
1940
0
      for( x = 0; x < iCols; x += 2 )
1941
0
      {
1942
0
        uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1943
0
      }
1944
0
      piOrg += 2*iStrideOrg;
1945
0
      piCur += 2*iStrideCur;
1946
0
    }
1947
0
  }
1948
0
  else
1949
0
  {
1950
0
    THROW( "Invalid size" );
1951
0
  }
1952
1953
0
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1954
0
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs<false>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs<true>(vvenc::DistParam const&)
1955
1956
1957
void RdCost::saveUnadjustedLambda()
1958
0
{
1959
0
  m_dLambda_unadjusted = m_dLambda;
1960
0
  m_DistScaleUnadjusted = m_DistScale;
1961
0
}
1962
1963
1964
// Weighted squared error of one sample pair:
//   ( fixedPTweight * (org - cur)^2 + (1 << 15) ) >> uiShift
// The result is passed through Intermediate_Int before being returned.
inline Distortion getWeightedMSE(const Pel org, const Pel cur, const int64_t fixedPTweight, unsigned uiShift)
{
  const Intermediate_Int delta = org - cur;
  const auto squaredErr        = delta * delta;
  const auto scaledErr         = ( fixedPTweight * squaredErr + ( 1 << 15 ) ) >> uiShift;
  return Intermediate_Int( scaledErr );
}
1969
1970
template<int csx>
1971
static Distortion lumaWeightedSSE_Core( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights )
1972
0
{
1973
0
        int  iRows = rcDtParam.org.height;
1974
0
  const Pel* piOrg = rcDtParam.org.buf;
1975
0
  const Pel* piCur = rcDtParam.cur.buf;
1976
0
  const int  iCols = rcDtParam.org.width;
1977
0
  const int  iStrideCur = rcDtParam.cur.stride;
1978
0
  const int  iStrideOrg = rcDtParam.org.stride;
1979
0
  const Pel* piOrgLuma        = rcDtParam.orgLuma->buf;
1980
0
  const int  iStrideOrgLuma   = rcDtParam.orgLuma->stride;
1981
1982
0
  Distortion uiSum   = 0;
1983
0
  uint32_t uiShift   = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1);
1984
1985
  // cf, column factor, offset of the second column, to be set to '0' for width of '1'
1986
0
  const int cf =  1 - ( iCols & 1 );
1987
0
  CHECK( ( iCols & 1 ) && iCols != 1, "Width can only be even or equal to '1'!" );
1988
0
  const ComponentID compId = rcDtParam.compID;
1989
0
  const size_t  cShiftY    = getComponentScaleY(compId, chmFmt);
1990
1991
0
  for( ; iRows != 0; iRows-- )
1992
0
  {
1993
0
    for (int n = 0; n < iCols; n+=2 )
1994
0
    {
1995
0
      uiSum += getWeightedMSE( piOrg[n   ], piCur[n   ], lumaWeights[piOrgLuma[(n   )<<csx]], uiShift );
1996
0
      uiSum += getWeightedMSE( piOrg[n+cf], piCur[n+cf], lumaWeights[piOrgLuma[(n+cf)<<csx]], uiShift );
1997
0
    }
1998
1999
0
    piOrg     += iStrideOrg;
2000
0
    piCur     += iStrideCur;
2001
0
    piOrgLuma += iStrideOrgLuma<<cShiftY;
2002
0
  }
2003
2004
0
  return ( uiSum >> ( 1 - cf ) );
2005
0
}
Unexecuted instantiation: RdCost.cpp:unsigned long vvenc::lumaWeightedSSE_Core<0>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
Unexecuted instantiation: RdCost.cpp:unsigned long vvenc::lumaWeightedSSE_Core<1>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
2006
2007
// Weighted SSE with one fixed-point weight applied to every sample.
// Two columns are handled per step; for a width of 1 the same sample is
// accumulated twice and the total is halved at the end.
static Distortion fixWeightedSSE_Core( const DistParam& rcDtParam, uint32_t fixedPTweight )
{
  const int  numRows   = rcDtParam.org.height;
  const Pel* orgRow    = rcDtParam.org.buf;
  const Pel* curRow    = rcDtParam.cur.buf;
  const int  iCols     = rcDtParam.org.width;
  const int  curStride = rcDtParam.cur.stride;
  const int  orgStride = rcDtParam.org.stride;

  const uint32_t uiShift = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1);

  // cf, column factor: offset of the second column, '0' for a width of '1'
  const int cf =  1 - ( iCols & 1 );
  CHECK( ( iCols & 1 ) && iCols != 1, "Width can only be even or equal to '1'!" );

  Distortion total = 0;

  for( int rowsLeft = numRows; rowsLeft != 0; rowsLeft-- )
  {
    for( int col = 0; col < iCols; col += 2 )
    {
      const int colB = col + cf;
      total += getWeightedMSE( orgRow[col ], curRow[col ], fixedPTweight, uiShift );
      total += getWeightedMSE( orgRow[colB], curRow[colB], fixedPTweight, uiShift );
    }
    orgRow += orgStride;
    curRow += curStride;
  }

  return ( total >> ( 1 - cf ) );
}
2036
2037
// Weighted SSE dispatcher. Non-luma components of SDR / HLG signals use the
// fixed-weight kernel with m_chromaWeight converted to 16-bit fixed point;
// everything else uses the luma-level weighted kernel selected by the
// component's horizontal scale. applyWeight is not supported.
Distortion RdCost::xGetSSE_WTD( const DistParam &rcDtParam ) const
{
  if( rcDtParam.applyWeight )
  {
    THROW("no support");
  }

  const bool sdrOrHlg       = ( m_signalType == RESHAPE_SIGNAL_SDR || m_signalType == RESHAPE_SIGNAL_HLG );
  const bool useFixedWeight = sdrOrHlg && rcDtParam.compID != COMP_Y;

  if( !useFixedWeight )
  {
    return m_wtdPredPtr[getComponentScaleX(rcDtParam.compID, m_cf)]( rcDtParam, m_cf, m_reshapeLumaLevelToWeightPLUT );
  }

  const uint32_t fixedPTweight = ( uint32_t ) ( m_chromaWeight * ( double ) ( 1 << 16 ) );

  return m_fxdWtdPredPtr( rcDtParam, fixedPTweight );
}
2057
2058
0
void RdCost::xGetSAD8X5(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
2059
0
  DistParam rcDtParamTmp0 = rcDtParam;
2060
2061
0
  DistParam rcDtParamTmp1 = rcDtParam;
2062
0
  rcDtParamTmp1.org.buf += 1;
2063
0
  rcDtParamTmp1.cur.buf -= 1;
2064
2065
0
  DistParam rcDtParamTmp2 = rcDtParam;
2066
0
  rcDtParamTmp2.org.buf += 2;
2067
0
  rcDtParamTmp2.cur.buf -= 2;
2068
2069
0
  DistParam rcDtParamTmp3 = rcDtParam;
2070
0
  rcDtParamTmp3.org.buf += 3;
2071
0
  rcDtParamTmp3.cur.buf -= 3;
2072
2073
0
  DistParam rcDtParamTmp4 = rcDtParam;
2074
0
  rcDtParamTmp4.org.buf += 4;
2075
0
  rcDtParamTmp4.cur.buf -= 4;
2076
  
2077
0
  cost[0] = (RdCost::xGetSAD8(rcDtParamTmp0)) >> 1;
2078
0
  cost[1] = (RdCost::xGetSAD8(rcDtParamTmp1)) >> 1;
2079
0
  if (isCalCentrePos) cost[2] = (RdCost::xGetSAD8(rcDtParamTmp2)) >> 1;
2080
0
  cost[3] = (RdCost::xGetSAD8(rcDtParamTmp3)) >> 1;
2081
0
  cost[4] = (RdCost::xGetSAD8(rcDtParamTmp4)) >> 1;
2082
0
}
2083
2084
0
void RdCost::xGetSAD16X5(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
2085
0
  DistParam rcDtParamTmp0 = rcDtParam;
2086
2087
0
  DistParam rcDtParamTmp1 = rcDtParam;
2088
0
  rcDtParamTmp1.org.buf += 1;
2089
0
  rcDtParamTmp1.cur.buf -= 1;
2090
2091
0
  DistParam rcDtParamTmp2 = rcDtParam;
2092
0
  rcDtParamTmp2.org.buf += 2;
2093
0
  rcDtParamTmp2.cur.buf -= 2;
2094
2095
0
  DistParam rcDtParamTmp3 = rcDtParam;
2096
0
  rcDtParamTmp3.org.buf += 3;
2097
0
  rcDtParamTmp3.cur.buf -= 3;
2098
2099
0
  DistParam rcDtParamTmp4 = rcDtParam;
2100
0
  rcDtParamTmp4.org.buf += 4;
2101
0
  rcDtParamTmp4.cur.buf -= 4;
2102
  
2103
0
  cost[0] = (RdCost::xGetSAD16(rcDtParamTmp0)) >> 1;
2104
0
  cost[1] = (RdCost::xGetSAD16(rcDtParamTmp1)) >> 1;
2105
0
  if (isCalCentrePos) cost[2] = (RdCost::xGetSAD16(rcDtParamTmp2)) >> 1;
2106
0
  cost[3] = (RdCost::xGetSAD16(rcDtParamTmp3)) >> 1;
2107
0
  cost[4] = (RdCost::xGetSAD16(rcDtParamTmp4)) >> 1;
2108
0
}
2109
2110
// Fills rcDP for a masked SAD computation: copies the original buffer, the
// reference pointer/stride, the mask description, bit depth and component,
// disables early exit (MAX_DISTORTION) and selects the DF_SAD_WITH_MASK
// distortion function.
void RdCost::setDistParamGeo(DistParam &rcDP, const CPelBuf &org, const Pel *piRefY, int iRefStride, const Pel *mask,
                          int iMaskStride, int stepX, int iMaskStride2, int bitDepth, ComponentID compID)
{
  // original block; the current (reference) block has the same dimensions
  rcDP.org        = org;
  rcDP.cur.buf    = piRefY;
  rcDP.cur.stride = iRefStride;
  rcDP.cur.width  = org.width;
  rcDP.cur.height = org.height;

  // mask layout
  rcDP.mask        = mask;
  rcDP.stepX       = stepX;
  rcDP.maskStride  = iMaskStride;
  rcDP.maskStride2 = iMaskStride2;

  // sample format
  rcDP.compID   = compID;
  rcDP.bitDepth = bitDepth;

  // cost function for motion estimation with mask, no early exit
  rcDP.maximumDistortionForEarlyExit = MAX_DISTORTION;
  rcDP.distFunc                      = m_afpDistortFunc[0][DF_SAD_WITH_MASK];
}
2135
2136
// Masked SAD: every absolute sample difference is multiplied by the current
// mask value; the mask pointer advances by stepX per sample and by
// maskStride * (1 << subShift) + maskStride2 per processed row. Only every
// (1 << subShift)-th row is visited and the sum is shifted left by subShift
// before the bit-depth adjustment shift is applied.
Distortion RdCost::xGetSADwMask(const DistParam &rcDtParam)
{
  const Pel *orgRow  = rcDtParam.org.buf;
  const Pel *curRow  = rcDtParam.cur.buf;
  const Pel *maskPos = rcDtParam.mask;

  const int numCols  = rcDtParam.org.width;
  const int numRows  = rcDtParam.org.height;
  const int subShift = rcDtParam.subShift;
  const int rowStep  = (1 << subShift);

  const int curRowStride   = rcDtParam.cur.stride * rowStep;
  const int orgRowStride   = rcDtParam.org.stride * rowStep;
  const int maskRowStride  = rcDtParam.maskStride * rowStep;
  const int maskColStep    = rcDtParam.stepX;
  const int maskRowStride2 = rcDtParam.maskStride2;

  const uint32_t distortionShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);

  Distortion total = 0;
  for( int rowsLeft = numRows; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int col = 0; col < numCols; col++, maskPos += maskColStep )
    {
      total += abs(orgRow[col] - curRow[col]) * *maskPos;
    }
    orgRow += orgRowStride;
    curRow += curRowStride;
    maskPos = maskPos + maskRowStride + maskRowStride2;
  }

  total <<= subShift;
  return (total >> distortionShift);
}
2168
2169
// Cost of the block vector (x, y): the bit count reported by
// getBitsMultiplePredsIBC multiplied by m_dCostIBC.
Distortion RdCost::getBvCostMultiplePredsIBC(int x, int y, bool useIMV)
{
  const unsigned int bvBits = getBitsMultiplePredsIBC(x, y, useIMV);
  return Distortion(m_dCostIBC * bvBits);
}
2173
2174
static inline unsigned getIComponentBitsIBC( int val )
2175
0
{
2176
0
  if( !val ) return 1;
2177
2178
0
  const unsigned int l2 = floorLog2( (val <= 0) ? (-val << 1) + 1 : (val << 1) );
2179
2180
0
  return (l2 << 1) + 1;
2181
0
}
2182
2183
unsigned int RdCost::getBitsMultiplePredsIBC(int x, int y, bool useIMV)
2184
0
{
2185
0
  int rmvH[2];
2186
0
  int rmvV[2];
2187
0
  rmvH[0] = x - m_bvPredictors[0].hor;
2188
0
  rmvH[1] = x - m_bvPredictors[1].hor;
2189
2190
0
  rmvV[0] = y - m_bvPredictors[0].ver;
2191
0
  rmvV[1] = y - m_bvPredictors[1].ver;
2192
0
  int absCand[2];
2193
0
  absCand[0] = abs(rmvH[0]) + abs(rmvV[0]);
2194
0
  absCand[1] = abs(rmvH[1]) + abs(rmvV[1]);
2195
2196
0
  if (useIMV && x % 4 == 0 && y % 4 == 0)
2197
0
  {
2198
0
    int rmvHQP[2];
2199
0
    int rmvVQP[2];
2200
2201
0
    int imvShift = 2;
2202
0
    int offset = 1 << (imvShift - 1);
2203
2204
0
    rmvHQP[0] = (x >> 2) - ((m_bvPredictors[0].hor + offset) >> 2);
2205
0
    rmvHQP[1] = (x >> 2) - ((m_bvPredictors[1].hor + offset) >> 2);
2206
0
    rmvVQP[0] = (y >> 2) - ((m_bvPredictors[0].ver + offset) >> 2);
2207
0
    rmvVQP[1] = (y >> 2) - ((m_bvPredictors[1].ver + offset) >> 2);
2208
2209
0
    int absCandQP[2];
2210
0
    absCandQP[0] = abs(rmvHQP[0]) + abs(rmvVQP[0]);
2211
0
    absCandQP[1] = abs(rmvHQP[1]) + abs(rmvVQP[1]);
2212
0
    unsigned int candBits0QP, candBits1QP;
2213
0
    if (absCand[0] < absCand[1])
2214
0
    {
2215
0
      unsigned int candBits0 = getIComponentBitsIBC(rmvH[0]) + getIComponentBitsIBC(rmvV[0]);
2216
0
      if (absCandQP[0] < absCandQP[1])
2217
0
      {
2218
0
        candBits0QP = getIComponentBitsIBC(rmvHQP[0]) + getIComponentBitsIBC(rmvVQP[0]);
2219
0
        return candBits0QP < candBits0 ? candBits0QP : candBits0;
2220
0
      }
2221
0
      else
2222
0
      {
2223
0
        candBits1QP = getIComponentBitsIBC(rmvHQP[1]) + getIComponentBitsIBC(rmvVQP[1]);
2224
0
        return candBits1QP < candBits0 ? candBits1QP : candBits0;
2225
0
      }
2226
0
    }
2227
0
    else
2228
0
    {
2229
0
      unsigned int candBits1 = getIComponentBitsIBC(rmvH[1]) + getIComponentBitsIBC(rmvV[1]);
2230
0
      if (absCandQP[0] < absCandQP[1])
2231
0
      {
2232
0
        candBits0QP = getIComponentBitsIBC(rmvHQP[0]) + getIComponentBitsIBC(rmvVQP[0]);
2233
0
        return candBits0QP < candBits1 ? candBits0QP : candBits1;
2234
0
      }
2235
0
      else
2236
0
      {
2237
0
        candBits1QP = getIComponentBitsIBC(rmvHQP[1]) + getIComponentBitsIBC(rmvVQP[1]);
2238
0
        return candBits1QP < candBits1 ? candBits1QP : candBits1;
2239
0
      }
2240
0
    }
2241
0
  }
2242
0
  else
2243
0
  {
2244
0
    if (absCand[0] < absCand[1])
2245
0
    {
2246
0
      return getIComponentBitsIBC(rmvH[0]) + getIComponentBitsIBC(rmvV[0]);
2247
0
    }
2248
0
    else
2249
0
    {
2250
0
      return getIComponentBitsIBC(rmvH[1]) + getIComponentBitsIBC(rmvV[1]);
2251
0
    }
2252
0
  }
2253
0
}
2254
2255
} // namespace vvenc
2256
2257
//! \}
2258