Coverage Report

Created: 2026-06-15 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/vvenc/source/Lib/CommonLib/RdCost.cpp
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
44
/** \file     RdCost.cpp
45
    \brief    RD cost computation class
46
*/
47
48
#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP
49
50
#include "RdCost.h"
51
#include "Rom.h"
52
#include "UnitPartitioner.h"
53
#include "SearchSpaceCounter.h"
54
55
56
//! \ingroup CommonLib
57
//! \{
58
59
namespace vvenc {
60
61
62
template<int csx>
63
static Distortion lumaWeightedSSE_Core( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights );
64
65
static Distortion fixWeightedSSE_Core( const DistParam& rcDtParam, uint32_t fixedWeight );
66
67
RdCost::RdCost()
68
17.3k
  : m_afpDistortFunc{ { nullptr, }, { nullptr, } }
69
17.3k
{
70
17.3k
}
71
72
RdCost::~RdCost()
73
17.3k
{
74
17.3k
}
75
76
void RdCost::setLambda( double dLambda, const BitDepths &bitDepths )
77
15.3k
{
78
15.3k
  m_dLambda          = dLambda;
79
15.3k
  m_DistScale        = double(1<<SCALE_BITS) / m_dLambda;
80
15.3k
  m_dLambdaMotionSAD = sqrt(m_dLambda);
81
15.3k
}
82
83
84
// Initialize Function Pointer by [eDFunc]
85
void RdCost::create( bool enableOpt )
86
17.3k
{
87
17.3k
  m_signalType                 = RESHAPE_SIGNAL_NULL;
88
17.3k
  m_chromaWeight               = 1.0;
89
17.3k
  m_lumaBD                     = 10;
90
17.3k
  m_afpDistortFunc[0][DF_SSE    ] = RdCost::xGetSSE;
91
17.3k
  m_afpDistortFunc[0][DF_SSE2   ] = RdCost::xGetSSE;
92
17.3k
  m_afpDistortFunc[0][DF_SSE4   ] = RdCost::xGetSSE4;
93
17.3k
  m_afpDistortFunc[0][DF_SSE8   ] = RdCost::xGetSSE8;
94
17.3k
  m_afpDistortFunc[0][DF_SSE16  ] = RdCost::xGetSSE16;
95
17.3k
  m_afpDistortFunc[0][DF_SSE32  ] = RdCost::xGetSSE32;
96
17.3k
  m_afpDistortFunc[0][DF_SSE64  ] = RdCost::xGetSSE64;
97
17.3k
  m_afpDistortFunc[0][DF_SSE128 ] = RdCost::xGetSSE128;
98
99
17.3k
  m_afpDistortFunc[0][DF_SAD    ] = RdCost::xGetSAD;
100
17.3k
  m_afpDistortFunc[0][DF_SAD2   ] = RdCost::xGetSAD;
101
17.3k
  m_afpDistortFunc[0][DF_SAD4   ] = RdCost::xGetSAD4;
102
17.3k
  m_afpDistortFunc[0][DF_SAD8   ] = RdCost::xGetSAD8;
103
17.3k
  m_afpDistortFunc[0][DF_SAD16  ] = RdCost::xGetSAD16;
104
17.3k
  m_afpDistortFunc[0][DF_SAD32  ] = RdCost::xGetSAD32;
105
17.3k
  m_afpDistortFunc[0][DF_SAD64  ] = RdCost::xGetSAD64;
106
17.3k
  m_afpDistortFunc[0][DF_SAD128 ] = RdCost::xGetSAD128;
107
108
17.3k
  m_afpDistortFunc[0][DF_HAD    ] = RdCost::xGetHADs<false>;
109
17.3k
  m_afpDistortFunc[0][DF_HAD2   ] = RdCost::xGetHADs<false>;
110
17.3k
  m_afpDistortFunc[0][DF_HAD4   ] = RdCost::xGetHADs<false>;
111
17.3k
  m_afpDistortFunc[0][DF_HAD8   ] = RdCost::xGetHADs<false>;
112
17.3k
  m_afpDistortFunc[0][DF_HAD16  ] = RdCost::xGetHADs<false>;
113
17.3k
  m_afpDistortFunc[0][DF_HAD32  ] = RdCost::xGetHADs<false>;
114
17.3k
  m_afpDistortFunc[0][DF_HAD64  ] = RdCost::xGetHADs<false>;
115
17.3k
  m_afpDistortFunc[0][DF_HAD128 ] = RdCost::xGetHADs<false>;
116
117
17.3k
  m_afpDistortFunc[0][DF_HAD_fast    ] = RdCost::xGetHADs<true>;
118
17.3k
  m_afpDistortFunc[0][DF_HAD2_fast   ] = RdCost::xGetHADs<true>;
119
17.3k
  m_afpDistortFunc[0][DF_HAD4_fast   ] = RdCost::xGetHADs<true>;
120
17.3k
  m_afpDistortFunc[0][DF_HAD8_fast   ] = RdCost::xGetHADs<true>;
121
17.3k
  m_afpDistortFunc[0][DF_HAD16_fast  ] = RdCost::xGetHADs<true>;
122
17.3k
  m_afpDistortFunc[0][DF_HAD32_fast  ] = RdCost::xGetHADs<true>;
123
17.3k
  m_afpDistortFunc[0][DF_HAD64_fast  ] = RdCost::xGetHADs<true>;
124
17.3k
  m_afpDistortFunc[0][DF_HAD128_fast ] = RdCost::xGetHADs<true>;
125
126
  //  m_afpDistortFunc[0][DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD;
127
17.3k
  m_afpDistortFunc[0][DF_HAD_2SAD ] = RdCost::xGetHAD2SADs;
128
129
17.3k
  m_afpDistortFunc[0][DF_SAD_WITH_MASK] = RdCost::xGetSADwMask;
130
  // m_afpDistortFunc[1] can be used in any case
131
17.3k
  memcpy( m_afpDistortFunc[1], m_afpDistortFunc[0], sizeof(m_afpDistortFunc)/2);
132
133
17.3k
  m_wtdPredPtr[0] = lumaWeightedSSE_Core<0>;
134
17.3k
  m_wtdPredPtr[1] = lumaWeightedSSE_Core<1>;
135
17.3k
  m_fxdWtdPredPtr = fixWeightedSSE_Core;
136
137
17.3k
  m_afpDistortFuncX5[0] = RdCost::xGetSAD8X5;
138
17.3k
  m_afpDistortFuncX5[1] = RdCost::xGetSAD16X5;
139
140
17.3k
#if ENABLE_SIMD_OPT_DIST
141
17.3k
  if( enableOpt )
142
17.3k
  {
143
#ifdef TARGET_SIMD_X86
144
    initRdCostX86();
145
#endif
146
#ifdef TARGET_SIMD_ARM
147
    initRdCostARM();
148
#endif
149
17.3k
  }
150
17.3k
#endif
151
152
17.3k
  m_costMode      = VVENC_COST_STANDARD_LOSSY;
153
17.3k
  m_motionLambda  = 0;
154
17.3k
  m_iCostScale    = 0;
155
17.3k
}
156
157
#if ENABLE_MEASURE_SEARCH_SPACE
158
static Distortion xMeasurePredSearchSpaceInterceptor( const DistParam& dp )
159
{
160
  g_searchSpaceAcc.addPrediction( dp.cur.width, dp.cur.height, toChannelType( dp.compID ) );
161
  return dp.xDistFunc( dp );
162
}
163
164
#endif
165
void RdCost::setDistParam( DistParam &rcDP, const CPelBuf& org, const Pel* piRefY, int iRefStride, int bitDepth, ComponentID compID, int subShiftMode, int useHadamard )
166
45.3k
{
167
45.3k
  rcDP.bitDepth   = bitDepth;
168
45.3k
  rcDP.compID     = compID;
169
170
  // set Original & Curr Pointer / Stride
171
45.3k
  rcDP.org        = org;
172
173
45.3k
  rcDP.cur.buf    = piRefY;
174
45.3k
  rcDP.cur.stride = iRefStride;
175
176
  // set Block Width / Height
177
45.3k
  rcDP.cur.width    = org.width;
178
45.3k
  rcDP.cur.height   = org.height;
179
45.3k
  rcDP.maximumDistortionForEarlyExit = MAX_DISTORTION;
180
181
45.3k
  const int base = (rcDP.bitDepth > 10 || rcDP.applyWeight) ? 1 : 0;
182
45.3k
  if( !useHadamard )
183
45.3k
  {
184
45.3k
    rcDP.distFunc = m_afpDistortFunc[base][ DF_SAD + Log2( org.width ) ];
185
45.3k
  }
186
0
  else
187
0
  {
188
0
    rcDP.distFunc = m_afpDistortFunc[base][( useHadamard == 1 ? DF_HAD : DF_HAD_fast ) + Log2( org.width ) ];
189
0
  }
190
191
  // initialize
192
45.3k
  rcDP.subShift  = 0;
193
194
45.3k
  if( subShiftMode == 1 )
195
0
  {
196
0
    if( rcDP.org.height > 8 && rcDP.org.width <= 128 )
197
0
    {
198
0
      rcDP.subShift = 1;
199
0
    }
200
0
  }
201
45.3k
  else if( subShiftMode == 2 )
202
0
  {
203
0
    if (rcDP.org.height > 8)
204
0
    {
205
0
      rcDP.subShift = 1;
206
0
    }
207
0
  }
208
209
#if ENABLE_MEASURE_SEARCH_SPACE
210
  rcDP.xDistFunc = rcDP.distFunc;
211
  rcDP.distFunc  = xMeasurePredSearchSpaceInterceptor;
212
#endif
213
45.3k
}
214
215
216
DistParam RdCost::setDistParam( const CPelBuf& org, const CPelBuf& cur, int bitDepth, DFunc dfunc )
217
250k
{
218
250k
  int index = dfunc;
219
250k
  if( dfunc != DF_HAD && dfunc != DF_HAD_fast && dfunc != DF_HAD_2SAD )
220
102k
  {
221
102k
    index += Log2(org.width);
222
102k
  }
223
224
250k
  const int base = bitDepth > 10 ? 1:0; //TBD: check whether SAD can ever overflow
225
#if ENABLE_MEASURE_SEARCH_SPACE
226
  DistParam rcDP( org, cur, m_afpDistortFunc[base][index], bitDepth, 0, COMP_Y );
227
  rcDP.xDistFunc = rcDP.distFunc;
228
  rcDP.distFunc  = xMeasurePredSearchSpaceInterceptor;
229
  return rcDP;
230
#else
231
250k
  return DistParam( org, cur, m_afpDistortFunc[base][index], bitDepth, 0, COMP_Y );
232
250k
#endif
233
250k
}
234
235
DistParam RdCost::setDistParam( const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShift, bool isDMVR )
236
0
{
237
0
  DistParam rcDP;
238
0
  rcDP.bitDepth   = bitDepth;
239
0
  rcDP.compID     = compID;
240
241
0
  rcDP.org.buf    = pOrg;
242
0
  rcDP.org.stride = iOrgStride;
243
0
  rcDP.org.width  = width;
244
0
  rcDP.org.height = height;
245
246
0
  rcDP.cur.buf    = piRefY;
247
0
  rcDP.cur.stride = iRefStride;
248
0
  rcDP.cur.width  = width;
249
0
  rcDP.cur.height = height;
250
0
  rcDP.subShift   = subShift;
251
252
  //  CHECK( useHadamard || rcDP.useMR, "only used in xDMVRCost with these default parameters (so far...)" );
253
0
  const int base = (rcDP.bitDepth > 10) ? 1 : 0;
254
255
0
  rcDP.distFunc = m_afpDistortFunc[base][ DF_SAD + Log2( width ) ];
256
  
257
0
  if( isDMVR )
258
0
  {
259
0
    rcDP.dmvrSadX5 = m_afpDistortFuncX5[Log2( width ) - 3];
260
0
  }
261
262
#if ENABLE_MEASURE_SEARCH_SPACE
263
  if( !isDMVR )
264
  {
265
    // DMVR is part of the decoder complexity, so DMVR cost evaluations are not counted here
266
    rcDP.xDistFunc = rcDP.distFunc;
267
    rcDP.distFunc = xMeasurePredSearchSpaceInterceptor;
268
  }
269
270
#endif
271
0
  return rcDP;
272
0
}
273
274
Distortion RdCost::getDistPart( const CPelBuf& org, const CPelBuf& cur, int bitDepth, const ComponentID compId, DFunc eDFunc, const CPelBuf* orgLuma )
275
2.45M
{
276
2.45M
  DistParam dp( org, cur, nullptr, bitDepth, 0, compId );
277
# if ENABLE_MEASURE_SEARCH_SPACE
278
  g_searchSpaceAcc.addPrediction( dp.cur.width, dp.cur.height, toChannelType( dp.compID ) );
279
#endif
280
2.45M
  Distortion dist;
281
2.45M
  if( orgLuma )
282
0
  {
283
0
    CHECKD( eDFunc != DF_SSE_WTD, "mismatch func and parameter")
284
0
    dp.orgLuma  = orgLuma;
285
0
    dist = RdCost::xGetSSE_WTD( dp );
286
0
  }
287
2.45M
  else
288
2.45M
  {
289
2.45M
    if( ( org.width == 1 ) )
290
0
    {
291
0
      dist = xGetSSE( dp );
292
0
    }
293
2.45M
    else
294
2.45M
    {
295
2.45M
      const int base = (bitDepth > 10) ? 1 : 0;
296
2.45M
      dist = m_afpDistortFunc[base][eDFunc + Log2(org.width)](dp);
297
2.45M
    }
298
2.45M
  }
299
2.45M
  if (isChroma(compId))
300
2.06M
  {
301
2.06M
    return ((Distortion) (m_distortionWeight[ compId ] * dist));
302
2.06M
  }
303
396k
  else
304
396k
  {
305
396k
    return dist;
306
396k
  }
307
2.45M
}
308
309
// ====================================================================================================================
310
// Distortion functions
311
// ====================================================================================================================
312
313
// --------------------------------------------------------------------------------------------------------------------
314
// SAD
315
// --------------------------------------------------------------------------------------------------------------------
316
317
Distortion RdCost::xGetSAD( const DistParam& rcDtParam )
318
0
{
319
0
  if ( rcDtParam.applyWeight )
320
0
  {
321
0
    THROW(" no support");
322
0
  }
323
324
0
  const Pel* piOrg           = rcDtParam.org.buf;
325
0
  const Pel* piCur           = rcDtParam.cur.buf;
326
0
  const int  iCols           = rcDtParam.org.width;
327
0
        int  iRows           = rcDtParam.org.height;
328
0
  const int  iSubShift       = rcDtParam.subShift;
329
0
  const int  iSubStep        = ( 1 << iSubShift );
330
0
  const int  iStrideCur      = rcDtParam.cur.stride * iSubStep;
331
0
  const int  iStrideOrg      = rcDtParam.org.stride * iSubStep;
332
0
  const uint32_t distortionShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
333
334
0
  Distortion uiSum = 0;
335
336
0
  for( ; iRows != 0; iRows -= iSubStep )
337
0
  {
338
0
    for (int n = 0; n < iCols; n++ )
339
0
    {
340
0
      uiSum += abs( piOrg[n] - piCur[n] );
341
0
    }
342
0
    if (rcDtParam.maximumDistortionForEarlyExit < ( uiSum >> distortionShift ))
343
0
    {
344
0
      return ( uiSum >> distortionShift );
345
0
    }
346
0
    piOrg += iStrideOrg;
347
0
    piCur += iStrideCur;
348
0
  }
349
350
0
  uiSum <<= iSubShift;
351
0
  return ( uiSum >> distortionShift );
352
0
}
353
354
Distortion RdCost::xGetSAD4( const DistParam& rcDtParam )
355
120k
{
356
120k
  if ( rcDtParam.applyWeight )
357
0
  {
358
0
    THROW(" no support");
359
0
  }
360
361
120k
  const Pel* piOrg   = rcDtParam.org.buf;
362
120k
  const Pel* piCur   = rcDtParam.cur.buf;
363
120k
  int  iRows         = rcDtParam.org.height;
364
120k
  int  iSubShift     = rcDtParam.subShift;
365
120k
  int  iSubStep      = ( 1 << iSubShift );
366
120k
  int  iStrideCur    = rcDtParam.cur.stride * iSubStep;
367
120k
  int  iStrideOrg    = rcDtParam.org.stride * iSubStep;
368
369
120k
  Distortion uiSum = 0;
370
371
1.48M
  for( ; iRows != 0; iRows -= iSubStep )
372
1.36M
  {
373
1.36M
    uiSum += abs( piOrg[0] - piCur[0] );
374
1.36M
    uiSum += abs( piOrg[1] - piCur[1] );
375
1.36M
    uiSum += abs( piOrg[2] - piCur[2] );
376
1.36M
    uiSum += abs( piOrg[3] - piCur[3] );
377
378
1.36M
    piOrg += iStrideOrg;
379
1.36M
    piCur += iStrideCur;
380
1.36M
  }
381
382
120k
  uiSum <<= iSubShift;
383
120k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
384
120k
}
385
386
Distortion RdCost::xGetSAD8( const DistParam& rcDtParam )
387
484k
{
388
484k
  if ( rcDtParam.applyWeight )
389
0
  {
390
0
    THROW(" no support");
391
0
  }
392
393
484k
  const Pel* piOrg      = rcDtParam.org.buf;
394
484k
  const Pel* piCur      = rcDtParam.cur.buf;
395
484k
  int  iRows            = rcDtParam.org.height;
396
484k
  int  iSubShift        = rcDtParam.subShift;
397
484k
  int  iSubStep         = ( 1 << iSubShift );
398
484k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
399
484k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
400
401
484k
  Distortion uiSum = 0;
402
403
10.2M
  for( ; iRows != 0; iRows-=iSubStep )
404
9.78M
  {
405
9.78M
    uiSum += abs( piOrg[0] - piCur[0] );
406
9.78M
    uiSum += abs( piOrg[1] - piCur[1] );
407
9.78M
    uiSum += abs( piOrg[2] - piCur[2] );
408
9.78M
    uiSum += abs( piOrg[3] - piCur[3] );
409
9.78M
    uiSum += abs( piOrg[4] - piCur[4] );
410
9.78M
    uiSum += abs( piOrg[5] - piCur[5] );
411
9.78M
    uiSum += abs( piOrg[6] - piCur[6] );
412
9.78M
    uiSum += abs( piOrg[7] - piCur[7] );
413
414
9.78M
    piOrg += iStrideOrg;
415
9.78M
    piCur += iStrideCur;
416
9.78M
  }
417
418
484k
  uiSum <<= iSubShift;
419
484k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
420
484k
}
421
422
Distortion RdCost::xGetSAD16( const DistParam& rcDtParam )
423
425k
{
424
425k
  if ( rcDtParam.applyWeight )
425
0
  {
426
0
    THROW(" no support");
427
0
  }
428
429
425k
  const Pel* piOrg      = rcDtParam.org.buf;
430
425k
  const Pel* piCur      = rcDtParam.cur.buf;
431
425k
  int  iRows            = rcDtParam.org.height;
432
425k
  int  iSubShift        = rcDtParam.subShift;
433
425k
  int  iSubStep         = ( 1 << iSubShift );
434
425k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
435
425k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
436
437
425k
  Distortion uiSum = 0;
438
439
9.44M
  for( ; iRows != 0; iRows -= iSubStep )
440
9.02M
  {
441
9.02M
    uiSum += abs( piOrg[0] - piCur[0] );
442
9.02M
    uiSum += abs( piOrg[1] - piCur[1] );
443
9.02M
    uiSum += abs( piOrg[2] - piCur[2] );
444
9.02M
    uiSum += abs( piOrg[3] - piCur[3] );
445
9.02M
    uiSum += abs( piOrg[4] - piCur[4] );
446
9.02M
    uiSum += abs( piOrg[5] - piCur[5] );
447
9.02M
    uiSum += abs( piOrg[6] - piCur[6] );
448
9.02M
    uiSum += abs( piOrg[7] - piCur[7] );
449
9.02M
    uiSum += abs( piOrg[8] - piCur[8] );
450
9.02M
    uiSum += abs( piOrg[9] - piCur[9] );
451
9.02M
    uiSum += abs( piOrg[10] - piCur[10] );
452
9.02M
    uiSum += abs( piOrg[11] - piCur[11] );
453
9.02M
    uiSum += abs( piOrg[12] - piCur[12] );
454
9.02M
    uiSum += abs( piOrg[13] - piCur[13] );
455
9.02M
    uiSum += abs( piOrg[14] - piCur[14] );
456
9.02M
    uiSum += abs( piOrg[15] - piCur[15] );
457
458
9.02M
    piOrg += iStrideOrg;
459
9.02M
    piCur += iStrideCur;
460
9.02M
  }
461
462
425k
  uiSum <<= iSubShift;
463
425k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
464
425k
}
465
466
467
Distortion RdCost::xGetSAD128( const DistParam &rcDtParam )
468
0
{
469
0
  const Pel* piOrg  = rcDtParam.org.buf;
470
0
  const Pel* piCur  = rcDtParam.cur.buf;
471
0
  int  iRows        = rcDtParam.org.height;
472
0
  int  iCols        = rcDtParam.org.width;
473
0
  int  iSubShift    = rcDtParam.subShift;
474
0
  int  iSubStep     = ( 1 << iSubShift );
475
0
  int  iStrideCur   = rcDtParam.cur.stride * iSubStep;
476
0
  int  iStrideOrg   = rcDtParam.org.stride * iSubStep;
477
478
0
  Distortion uiSum = 0;
479
480
0
  for( ; iRows != 0; iRows-=iSubStep )
481
0
  {
482
0
    for (int n = 0; n < iCols; n+=16 )
483
0
    {
484
0
      uiSum += abs( piOrg[n+ 0] - piCur[n+ 0] );
485
0
      uiSum += abs( piOrg[n+ 1] - piCur[n+ 1] );
486
0
      uiSum += abs( piOrg[n+ 2] - piCur[n+ 2] );
487
0
      uiSum += abs( piOrg[n+ 3] - piCur[n+ 3] );
488
0
      uiSum += abs( piOrg[n+ 4] - piCur[n+ 4] );
489
0
      uiSum += abs( piOrg[n+ 5] - piCur[n+ 5] );
490
0
      uiSum += abs( piOrg[n+ 6] - piCur[n+ 6] );
491
0
      uiSum += abs( piOrg[n+ 7] - piCur[n+ 7] );
492
0
      uiSum += abs( piOrg[n+ 8] - piCur[n+ 8] );
493
0
      uiSum += abs( piOrg[n+ 9] - piCur[n+ 9] );
494
0
      uiSum += abs( piOrg[n+10] - piCur[n+10] );
495
0
      uiSum += abs( piOrg[n+11] - piCur[n+11] );
496
0
      uiSum += abs( piOrg[n+12] - piCur[n+12] );
497
0
      uiSum += abs( piOrg[n+13] - piCur[n+13] );
498
0
      uiSum += abs( piOrg[n+14] - piCur[n+14] );
499
0
      uiSum += abs( piOrg[n+15] - piCur[n+15] );
500
0
    }
501
0
    piOrg += iStrideOrg;
502
0
    piCur += iStrideCur;
503
0
  }
504
505
0
  uiSum <<= iSubShift;
506
0
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
507
0
}
508
509
Distortion RdCost::xGetSAD32( const DistParam &rcDtParam )
510
604k
{
511
604k
  if ( rcDtParam.applyWeight )
512
0
  {
513
0
    THROW(" no support");
514
0
  }
515
516
604k
  const Pel* piOrg      = rcDtParam.org.buf;
517
604k
  const Pel* piCur      = rcDtParam.cur.buf;
518
604k
  int  iRows            = rcDtParam.org.height;
519
604k
  int  iSubShift        = rcDtParam.subShift;
520
604k
  int  iSubStep         = ( 1 << iSubShift );
521
604k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
522
604k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
523
524
604k
  Distortion uiSum = 0;
525
526
12.1M
  for( ; iRows != 0; iRows-=iSubStep )
527
11.5M
  {
528
11.5M
    uiSum += abs( piOrg[0] - piCur[0] );
529
11.5M
    uiSum += abs( piOrg[1] - piCur[1] );
530
11.5M
    uiSum += abs( piOrg[2] - piCur[2] );
531
11.5M
    uiSum += abs( piOrg[3] - piCur[3] );
532
11.5M
    uiSum += abs( piOrg[4] - piCur[4] );
533
11.5M
    uiSum += abs( piOrg[5] - piCur[5] );
534
11.5M
    uiSum += abs( piOrg[6] - piCur[6] );
535
11.5M
    uiSum += abs( piOrg[7] - piCur[7] );
536
11.5M
    uiSum += abs( piOrg[8] - piCur[8] );
537
11.5M
    uiSum += abs( piOrg[9] - piCur[9] );
538
11.5M
    uiSum += abs( piOrg[10] - piCur[10] );
539
11.5M
    uiSum += abs( piOrg[11] - piCur[11] );
540
11.5M
    uiSum += abs( piOrg[12] - piCur[12] );
541
11.5M
    uiSum += abs( piOrg[13] - piCur[13] );
542
11.5M
    uiSum += abs( piOrg[14] - piCur[14] );
543
11.5M
    uiSum += abs( piOrg[15] - piCur[15] );
544
11.5M
    uiSum += abs( piOrg[16] - piCur[16] );
545
11.5M
    uiSum += abs( piOrg[17] - piCur[17] );
546
11.5M
    uiSum += abs( piOrg[18] - piCur[18] );
547
11.5M
    uiSum += abs( piOrg[19] - piCur[19] );
548
11.5M
    uiSum += abs( piOrg[20] - piCur[20] );
549
11.5M
    uiSum += abs( piOrg[21] - piCur[21] );
550
11.5M
    uiSum += abs( piOrg[22] - piCur[22] );
551
11.5M
    uiSum += abs( piOrg[23] - piCur[23] );
552
11.5M
    uiSum += abs( piOrg[24] - piCur[24] );
553
11.5M
    uiSum += abs( piOrg[25] - piCur[25] );
554
11.5M
    uiSum += abs( piOrg[26] - piCur[26] );
555
11.5M
    uiSum += abs( piOrg[27] - piCur[27] );
556
11.5M
    uiSum += abs( piOrg[28] - piCur[28] );
557
11.5M
    uiSum += abs( piOrg[29] - piCur[29] );
558
11.5M
    uiSum += abs( piOrg[30] - piCur[30] );
559
11.5M
    uiSum += abs( piOrg[31] - piCur[31] );
560
561
11.5M
    piOrg += iStrideOrg;
562
11.5M
    piCur += iStrideCur;
563
11.5M
  }
564
565
604k
  uiSum <<= iSubShift;
566
604k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
567
604k
}
568
569
570
Distortion RdCost::xGetSAD64( const DistParam &rcDtParam )
571
7.53k
{
572
7.53k
  if ( rcDtParam.applyWeight )
573
0
  {
574
0
    THROW(" no support");
575
0
  }
576
577
7.53k
  const Pel* piOrg      = rcDtParam.org.buf;
578
7.53k
  const Pel* piCur      = rcDtParam.cur.buf;
579
7.53k
  int  iRows            = rcDtParam.org.height;
580
7.53k
  int  iSubShift        = rcDtParam.subShift;
581
7.53k
  int  iSubStep         = ( 1 << iSubShift );
582
7.53k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
583
7.53k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
584
585
7.53k
  Distortion uiSum = 0;
586
587
489k
  for( ; iRows != 0; iRows-=iSubStep )
588
482k
  {
589
482k
    uiSum += abs( piOrg[0] - piCur[0] );
590
482k
    uiSum += abs( piOrg[1] - piCur[1] );
591
482k
    uiSum += abs( piOrg[2] - piCur[2] );
592
482k
    uiSum += abs( piOrg[3] - piCur[3] );
593
482k
    uiSum += abs( piOrg[4] - piCur[4] );
594
482k
    uiSum += abs( piOrg[5] - piCur[5] );
595
482k
    uiSum += abs( piOrg[6] - piCur[6] );
596
482k
    uiSum += abs( piOrg[7] - piCur[7] );
597
482k
    uiSum += abs( piOrg[8] - piCur[8] );
598
482k
    uiSum += abs( piOrg[9] - piCur[9] );
599
482k
    uiSum += abs( piOrg[10] - piCur[10] );
600
482k
    uiSum += abs( piOrg[11] - piCur[11] );
601
482k
    uiSum += abs( piOrg[12] - piCur[12] );
602
482k
    uiSum += abs( piOrg[13] - piCur[13] );
603
482k
    uiSum += abs( piOrg[14] - piCur[14] );
604
482k
    uiSum += abs( piOrg[15] - piCur[15] );
605
482k
    uiSum += abs( piOrg[16] - piCur[16] );
606
482k
    uiSum += abs( piOrg[17] - piCur[17] );
607
482k
    uiSum += abs( piOrg[18] - piCur[18] );
608
482k
    uiSum += abs( piOrg[19] - piCur[19] );
609
482k
    uiSum += abs( piOrg[20] - piCur[20] );
610
482k
    uiSum += abs( piOrg[21] - piCur[21] );
611
482k
    uiSum += abs( piOrg[22] - piCur[22] );
612
482k
    uiSum += abs( piOrg[23] - piCur[23] );
613
482k
    uiSum += abs( piOrg[24] - piCur[24] );
614
482k
    uiSum += abs( piOrg[25] - piCur[25] );
615
482k
    uiSum += abs( piOrg[26] - piCur[26] );
616
482k
    uiSum += abs( piOrg[27] - piCur[27] );
617
482k
    uiSum += abs( piOrg[28] - piCur[28] );
618
482k
    uiSum += abs( piOrg[29] - piCur[29] );
619
482k
    uiSum += abs( piOrg[30] - piCur[30] );
620
482k
    uiSum += abs( piOrg[31] - piCur[31] );
621
482k
    uiSum += abs( piOrg[32] - piCur[32] );
622
482k
    uiSum += abs( piOrg[33] - piCur[33] );
623
482k
    uiSum += abs( piOrg[34] - piCur[34] );
624
482k
    uiSum += abs( piOrg[35] - piCur[35] );
625
482k
    uiSum += abs( piOrg[36] - piCur[36] );
626
482k
    uiSum += abs( piOrg[37] - piCur[37] );
627
482k
    uiSum += abs( piOrg[38] - piCur[38] );
628
482k
    uiSum += abs( piOrg[39] - piCur[39] );
629
482k
    uiSum += abs( piOrg[40] - piCur[40] );
630
482k
    uiSum += abs( piOrg[41] - piCur[41] );
631
482k
    uiSum += abs( piOrg[42] - piCur[42] );
632
482k
    uiSum += abs( piOrg[43] - piCur[43] );
633
482k
    uiSum += abs( piOrg[44] - piCur[44] );
634
482k
    uiSum += abs( piOrg[45] - piCur[45] );
635
482k
    uiSum += abs( piOrg[46] - piCur[46] );
636
482k
    uiSum += abs( piOrg[47] - piCur[47] );
637
482k
    uiSum += abs( piOrg[48] - piCur[48] );
638
482k
    uiSum += abs( piOrg[49] - piCur[49] );
639
482k
    uiSum += abs( piOrg[50] - piCur[50] );
640
482k
    uiSum += abs( piOrg[51] - piCur[51] );
641
482k
    uiSum += abs( piOrg[52] - piCur[52] );
642
482k
    uiSum += abs( piOrg[53] - piCur[53] );
643
482k
    uiSum += abs( piOrg[54] - piCur[54] );
644
482k
    uiSum += abs( piOrg[55] - piCur[55] );
645
482k
    uiSum += abs( piOrg[56] - piCur[56] );
646
482k
    uiSum += abs( piOrg[57] - piCur[57] );
647
482k
    uiSum += abs( piOrg[58] - piCur[58] );
648
482k
    uiSum += abs( piOrg[59] - piCur[59] );
649
482k
    uiSum += abs( piOrg[60] - piCur[60] );
650
482k
    uiSum += abs( piOrg[61] - piCur[61] );
651
482k
    uiSum += abs( piOrg[62] - piCur[62] );
652
482k
    uiSum += abs( piOrg[63] - piCur[63] );
653
654
482k
    piOrg += iStrideOrg;
655
482k
    piCur += iStrideCur;
656
482k
  }
657
658
7.53k
  uiSum <<= iSubShift;
659
7.53k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
660
7.53k
}
661
662
663
// --------------------------------------------------------------------------------------------------------------------
664
// SSE
665
// --------------------------------------------------------------------------------------------------------------------
666
667
// Generic sum of squared errors between rcDtParam.org and rcDtParam.cur.
// The block size is taken from org.width x org.height. Every squared sample
// difference is right-shifted by twice DISTORTION_PRECISION_ADJUSTMENT(bitDepth)
// before it is accumulated. The applyWeight path is not supported and hits THROW.
Distortion RdCost::xGetSSE( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  const int      numCols   = rcDtParam.org.width;
  const int      numRows   = rcDtParam.org.height;
  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  // Walk the block row by row; both row pointers advance by their own stride.
  for( int rowsLeft = numRows; rowsLeft != 0; --rowsLeft, orgRow += orgStride, curRow += curStride )
  {
    for( int x = 0; x < numCols; ++x )
    {
      const Intermediate_Int err = orgRow[x] - curRow[x];
      sse += Distortion( ( err * err ) >> shift );
    }
  }

  return sse;
}
699
700
// Sum of squared errors for blocks that are 4 samples wide.
// Exactly 4 columns are read per row for org.height rows; org.width is only
// CHECKed on the (unsupported) applyWeight path, which then hits THROW.
// Each squared difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before accumulation.
Distortion RdCost::xGetSSE4( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 4, "Invalid size" );
    THROW(" no support");
  }

  const int      numCols   = 4;
  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft )
  {
    // Fixed trip count, left to the compiler to unroll.
    for( int x = 0; x < numCols; ++x )
    {
      const Intermediate_Int err = orgRow[x] - curRow[x];
      sse += Distortion( ( err * err ) >> shift );
    }

    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}
733
734
// Sum of squared errors for blocks that are 8 samples wide.
// Exactly 8 columns are read per row for org.height rows; org.width is only
// CHECKed on the (unsupported) applyWeight path, which then hits THROW.
// Each squared difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before accumulation.
Distortion RdCost::xGetSSE8( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 8, "Invalid size" );
    THROW(" no support");
  }

  const int      numCols   = 8;
  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft )
  {
    // Fixed trip count, left to the compiler to unroll.
    for( int x = 0; x < numCols; ++x )
    {
      const Intermediate_Int err = orgRow[x] - curRow[x];
      sse += Distortion( ( err * err ) >> shift );
    }

    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}
770
771
// Sum of squared errors for blocks that are 16 samples wide.
// Exactly 16 columns are read per row for org.height rows; org.width is only
// CHECKed on the (unsupported) applyWeight path, which then hits THROW.
// Each squared difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before accumulation.
Distortion RdCost::xGetSSE16( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 16, "Invalid size" );
    THROW(" no support");
  }

  const int      numCols   = 16;
  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft )
  {
    // Fixed trip count, left to the compiler to unroll.
    for( int x = 0; x < numCols; ++x )
    {
      const Intermediate_Int err = orgRow[x] - curRow[x];
      sse += Distortion( ( err * err ) >> shift );
    }

    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}
816
817
// Sum of squared errors for wide blocks, consuming columns in groups of 16.
// The column loop steps by 16 up to org.width, so a width that is not a
// multiple of 16 is read up to the next multiple of 16.
// Each squared difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before accumulation.
// The applyWeight path is not supported and hits THROW.
Distortion RdCost::xGetSSE128( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  const int      groupSize = 16;
  const int      numCols   = rcDtParam.org.width;
  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft )
  {
    for( int base = 0; base < numCols; base += groupSize )
    {
      // One full group of 16 samples is always processed once entered.
      for( int k = 0; k < groupSize; ++k )
      {
        const Intermediate_Int err = orgRow[base + k] - curRow[base + k];
        sse += Distortion( ( err * err ) >> shift );
      }
    }

    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}
864
865
// Sum of squared errors for blocks that are 32 samples wide.
// Exactly 32 columns are read per row for org.height rows; org.width is not
// consulted. Each squared difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before accumulation.
// The applyWeight path is not supported and hits THROW.
Distortion RdCost::xGetSSE32( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  const int      numCols   = 32;
  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft )
  {
    // Fixed trip count, left to the compiler to unroll.
    for( int x = 0; x < numCols; ++x )
    {
      const Intermediate_Int err = orgRow[x] - curRow[x];
      sse += Distortion( ( err * err ) >> shift );
    }

    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}
925
926
// Sum of squared errors for blocks that are 64 samples wide.
// Exactly 64 columns are read per row for org.height rows; org.width is not
// consulted. Each squared difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before accumulation.
// The applyWeight path is not supported and hits THROW.
Distortion RdCost::xGetSSE64( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  const int      numCols   = 64;
  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;
  Distortion sse    = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; --rowsLeft )
  {
    // Fixed trip count, left to the compiler to unroll.
    for( int x = 0; x < numCols; ++x )
    {
      const Intermediate_Int err = orgRow[x] - curRow[x];
      sse += Distortion( ( err * err ) >> shift );
    }

    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}
1017
1018
// --------------------------------------------------------------------------------------------------------------------
1019
// HADAMARD with step (used in fractional search)
1020
// --------------------------------------------------------------------------------------------------------------------
1021
1022
// 2x2 Hadamard cost of the residual piOrg - piCur.
// Reads two samples from each of two consecutive rows (rows are iStrideOrg /
// iStrideCur apart), applies a 2x2 butterfly and returns the sum of absolute
// transform values, with the DC term (sum of all four residuals) scaled by 1/4.
Distortion RdCost::xCalcHADs2x2( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  // Residual samples.
  const TCoeff topLeft  = piOrg[0]              - piCur[0];
  const TCoeff topRight = piOrg[1]              - piCur[1];
  const TCoeff botLeft  = piOrg[iStrideOrg]     - piCur[iStrideCur];
  const TCoeff botRight = piOrg[iStrideOrg + 1] - piCur[iStrideCur + 1];

  // Vertical butterfly.
  const TCoeff sumLeft   = topLeft  + botLeft;
  const TCoeff sumRight  = topRight + botRight;
  const TCoeff diffLeft  = topLeft  - botLeft;
  const TCoeff diffRight = topRight - botRight;

  // Horizontal butterfly folded into the accumulation.
  Distortion satd = 0;
  satd += abs( sumLeft  + sumRight ) >> 2;   // DC term, down-weighted
  satd += abs( sumLeft  - sumRight );
  satd += abs( diffLeft + diffRight );
  satd += abs( diffLeft - diffRight );

  return satd;
}
1043
1044
// 4x4 Hadamard cost of the residual piOrg - piCur.
// The 16 residuals are stored row-major, run through two vertical and two
// horizontal butterfly stages, and the absolute transform values are summed.
// The first coefficient (DC) contributes only a quarter of its magnitude and
// the total is halved with rounding.
static Distortion xCalcHADs4x4( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  TCoeff resi[16], stageA[16], stageB[16];

  // Residual, row-major 4x4.
  for( int y = 0; y < 4; ++y )
  {
    const Pel* orgRow = piOrg + y * iStrideOrg;
    const Pel* curRow = piCur + y * iStrideCur;
    for( int x = 0; x < 4; ++x )
    {
      resi[4 * y + x] = orgRow[x] - curRow[x];
    }
  }

  /*===== hadamard transform =====*/
  // Vertical stage 1: pair row 0 with row 3 and row 1 with row 2.
  for( int x = 0; x < 4; ++x )
  {
    stageA[x     ] = resi[x    ] + resi[x + 12];
    stageA[x +  4] = resi[x + 4] + resi[x +  8];
    stageA[x +  8] = resi[x + 4] - resi[x +  8];
    stageA[x + 12] = resi[x    ] - resi[x + 12];
  }

  // Vertical stage 2.
  for( int x = 0; x < 4; ++x )
  {
    stageB[x     ] = stageA[x     ] + stageA[x +  4];
    stageB[x +  4] = stageA[x +  8] + stageA[x + 12];
    stageB[x +  8] = stageA[x     ] - stageA[x +  4];
    stageB[x + 12] = stageA[x + 12] - stageA[x +  8];
  }

  // Horizontal stage 1, one group of four per row.
  for( int r = 0; r < 16; r += 4 )
  {
    stageA[r    ] = stageB[r    ] + stageB[r + 3];
    stageA[r + 1] = stageB[r + 1] + stageB[r + 2];
    stageA[r + 2] = stageB[r + 1] - stageB[r + 2];
    stageA[r + 3] = stageB[r    ] - stageB[r + 3];
  }

  // Horizontal stage 2.
  for( int r = 0; r < 16; r += 4 )
  {
    stageB[r    ] = stageA[r    ] + stageA[r + 1];
    stageB[r + 1] = stageA[r    ] - stageA[r + 1];
    stageB[r + 2] = stageA[r + 2] + stageA[r + 3];
    stageB[r + 3] = stageA[r + 3] - stageA[r + 2];
  }

  Distortion satd = 0;
  for( int k = 0; k < 16; ++k )
  {
    satd += abs( stageB[k] );
  }

  // Replace the full DC magnitude by a quarter of it, then halve with rounding.
  satd -= abs( stageB[0] );
  satd += abs( stageB[0] ) >> 2;
  satd  = ( satd + 1 ) >> 1;

  return satd;
}
1141
1142
// Fast Hadamard cost over a 16x16 area.
// Each of the 8x8 residuals is the difference between the rounded 2x2 average
// of the original samples and the rounded 2x2 average of the current samples.
// An 8x8 Hadamard transform (rows, then columns) is applied, the absolute
// coefficients are summed with the DC term weighted by 1/4, the sum is divided
// by 4 with rounding, and the result is scaled back up by 4.
static Distortion xCalcHADs16x16_fast( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  TCoeff coef[8][8];

  // Residual of 2x2-averaged samples; every output row consumes two input rows.
  for( int y = 0; y < 8; ++y )
  {
    const Pel* org0 = piOrg + 2 * y * iStrideOrg;
    const Pel* cur0 = piCur + 2 * y * iStrideCur;
    for( int x = 0; x < 8; ++x )
    {
      const int c = 2 * x;
      coef[y][x] = ( ( org0[c] + org0[c + 1] + org0[c + iStrideOrg] + org0[c + 1 + iStrideOrg] + 2 ) >> 2 )
                 - ( ( cur0[c] + cur0[c + 1] + cur0[c + iStrideCur] + cur0[c + 1 + iStrideCur] + 2 ) >> 2 );
    }
  }

  //horizontal
  for( int y = 0; y < 8; ++y )
  {
    TCoeff a[8], b[8];

    for( int x = 0; x < 4; ++x )          // distance-4 butterflies
    {
      a[x    ] = coef[y][x] + coef[y][x + 4];
      a[x + 4] = coef[y][x] - coef[y][x + 4];
    }
    for( int h = 0; h < 8; h += 4 )       // distance-2 butterflies in each half
    {
      for( int x = h; x < h + 2; ++x )
      {
        b[x    ] = a[x] + a[x + 2];
        b[x + 2] = a[x] - a[x + 2];
      }
    }
    for( int x = 0; x < 8; x += 2 )       // distance-1 butterflies
    {
      coef[y][x    ] = b[x] + b[x + 1];
      coef[y][x + 1] = b[x] - b[x + 1];
    }
  }

  //vertical
  for( int x = 0; x < 8; ++x )
  {
    TCoeff a[8], b[8];

    for( int y = 0; y < 4; ++y )
    {
      a[y    ] = coef[y][x] + coef[y + 4][x];
      a[y + 4] = coef[y][x] - coef[y + 4][x];
    }
    for( int h = 0; h < 8; h += 4 )
    {
      for( int y = h; y < h + 2; ++y )
      {
        b[y    ] = a[y] + a[y + 2];
        b[y + 2] = a[y] - a[y + 2];
      }
    }
    for( int y = 0; y < 8; y += 2 )
    {
      coef[y    ][x] = b[y] + b[y + 1];
      coef[y + 1][x] = b[y] - b[y + 1];
    }
  }

  Distortion sad = 0;
  for( int y = 0; y < 8; ++y )
  {
    for( int x = 0; x < 8; ++x )
    {
      sad += abs( coef[y][x] );
    }
  }

  // Replace the full DC magnitude by a quarter of it, then normalise.
  sad -= abs( coef[0][0] );
  sad += abs( coef[0][0] ) >> 2;
  sad  = ( sad + 2 ) >> 2;

  return ( sad << 2 );
}
1240
1241
// 8x8 Hadamard cost of the residual piOrg - piCur.
// The residual block is transformed with three butterfly stages along each
// row and then along each column. The absolute coefficients are summed with
// the DC term weighted by 1/4, and the sum is divided by 4 with rounding.
static Distortion xCalcHADs8x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  TCoeff coef[8][8];

  // Residual.
  for( int y = 0; y < 8; ++y )
  {
    const Pel* orgRow = piOrg + y * iStrideOrg;
    const Pel* curRow = piCur + y * iStrideCur;
    for( int x = 0; x < 8; ++x )
    {
      coef[y][x] = orgRow[x] - curRow[x];
    }
  }

  //horizontal
  for( int y = 0; y < 8; ++y )
  {
    TCoeff a[8], b[8];

    for( int x = 0; x < 4; ++x )          // distance-4 butterflies
    {
      a[x    ] = coef[y][x] + coef[y][x + 4];
      a[x + 4] = coef[y][x] - coef[y][x + 4];
    }
    for( int h = 0; h < 8; h += 4 )       // distance-2 butterflies in each half
    {
      for( int x = h; x < h + 2; ++x )
      {
        b[x    ] = a[x] + a[x + 2];
        b[x + 2] = a[x] - a[x + 2];
      }
    }
    for( int x = 0; x < 8; x += 2 )       // distance-1 butterflies
    {
      coef[y][x    ] = b[x] + b[x + 1];
      coef[y][x + 1] = b[x] - b[x + 1];
    }
  }

  //vertical
  for( int x = 0; x < 8; ++x )
  {
    TCoeff a[8], b[8];

    for( int y = 0; y < 4; ++y )
    {
      a[y    ] = coef[y][x] + coef[y + 4][x];
      a[y + 4] = coef[y][x] - coef[y + 4][x];
    }
    for( int h = 0; h < 8; h += 4 )
    {
      for( int y = h; y < h + 2; ++y )
      {
        b[y    ] = a[y] + a[y + 2];
        b[y + 2] = a[y] - a[y + 2];
      }
    }
    for( int y = 0; y < 8; y += 2 )
    {
      coef[y    ][x] = b[y] + b[y + 1];
      coef[y + 1][x] = b[y] - b[y + 1];
    }
  }

  Distortion sad = 0;
  for( int y = 0; y < 8; ++y )
  {
    for( int x = 0; x < 8; ++x )
    {
      sad += abs( coef[y][x] );
    }
  }

  // Replace the full DC magnitude by a quarter of it, then normalise.
  sad -= abs( coef[0][0] );
  sad += abs( coef[0][0] ) >> 2;
  sad  = ( sad + 2 ) >> 2;

  return sad;
}
1339
1340
// Hadamard-transformed absolute difference of one 16x8 block (16 columns, 8 rows).
//   piOrg, piCur           : top-left sample of the block in the original / current buffer
//   iStrideOrg, iStrideCur : distance between consecutive rows of the respective buffer
// The residual (org - cur) is transformed with a 16-point butterfly along each row and an
// 8-point butterfly along each column. The absolute coefficients are summed, the [0][0]
// coefficient contributes only a quarter of its magnitude, and the sum is scaled by
// 2 / sqrt( 16 * 8 ).
// TODO: a SIMD implementation is still missing (original note by JCA).
static Distortion xCalcHADs16x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  const int numCols = 16;
  const int numRows = 8;
  int coef[numRows][numCols];

  // residual
  for( int row = 0; row < numRows; row++ )
  {
    for( int col = 0; col < numCols; col++ )
    {
      coef[row][col] = piOrg[col] - piCur[col];
    }
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  // horizontal: each stage pairs element pos with pos + half, halving the distance per stage
  for( int row = 0; row < numRows; row++ )
  {
    int* line = coef[row];
    for( int half = numCols >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < numCols; base += 2 * half )
      {
        for( int pos = base; pos < base + half; pos++ )
        {
          const int lo = line[pos];
          const int hi = line[pos + half];
          line[pos]        = lo + hi;
          line[pos + half] = lo - hi;
        }
      }
    }
  }

  // vertical: same butterfly applied down every column
  for( int col = 0; col < numCols; col++ )
  {
    for( int half = numRows >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < numRows; base += 2 * half )
      {
        for( int pos = base; pos < base + half; pos++ )
        {
          const int lo = coef[pos][col];
          const int hi = coef[pos + half][col];
          coef[pos][col]        = lo + hi;
          coef[pos + half][col] = lo - hi;
        }
      }
    }
  }

  int sad = 0;
  for( int row = 0; row < numRows; row++ )
  {
    for( int col = 0; col < numCols; col++ )
    {
      sad += abs( coef[row][col] );
    }
  }

  // the [0][0] coefficient is counted with a quarter of its magnitude
  const int absDc = abs( coef[0][0] );
  sad -= absDc;
  sad += absDc >> 2;
  sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );

  return sad;
}
1487
1488
// Hadamard-transformed absolute difference of one 8x16 block (8 columns, 16 rows).
//   piOrg, piCur           : top-left sample of the block in the original / current buffer
//   iStrideOrg, iStrideCur : distance between consecutive rows of the respective buffer
// The residual (org - cur) is transformed with an 8-point butterfly along each row and a
// 16-point butterfly along each column. The absolute coefficients are summed, the [0][0]
// coefficient contributes only a quarter of its magnitude, and the sum is scaled by
// 2 / sqrt( 16 * 8 ).
static Distortion xCalcHADs8x16( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  const int numCols = 8;
  const int numRows = 16;
  int coef[numRows][numCols];

  // residual
  for( int row = 0; row < numRows; row++ )
  {
    for( int col = 0; col < numCols; col++ )
    {
      coef[row][col] = piOrg[col] - piCur[col];
    }
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  // horizontal: each stage pairs element pos with pos + half, halving the distance per stage
  for( int row = 0; row < numRows; row++ )
  {
    int* line = coef[row];
    for( int half = numCols >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < numCols; base += 2 * half )
      {
        for( int pos = base; pos < base + half; pos++ )
        {
          const int lo = line[pos];
          const int hi = line[pos + half];
          line[pos]        = lo + hi;
          line[pos + half] = lo - hi;
        }
      }
    }
  }

  // vertical: same butterfly applied down every column
  for( int col = 0; col < numCols; col++ )
  {
    for( int half = numRows >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < numRows; base += 2 * half )
      {
        for( int pos = base; pos < base + half; pos++ )
        {
          const int lo = coef[pos][col];
          const int hi = coef[pos + half][col];
          coef[pos][col]        = lo + hi;
          coef[pos + half][col] = lo - hi;
        }
      }
    }
  }

  int sad = 0;
  for( int row = 0; row < numRows; row++ )
  {
    for( int col = 0; col < numCols; col++ )
    {
      sad += abs( coef[row][col] );
    }
  }

  // the [0][0] coefficient is counted with a quarter of its magnitude
  const int absDc = abs( coef[0][0] );
  sad -= absDc;
  sad += absDc >> 2;
  sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );

  return sad;
}
1626
1627
// Hadamard-transformed absolute difference of one 4x8 block (4 columns, 8 rows).
//   piOrg, piCur           : top-left sample of the block in the original / current buffer
//   iStrideOrg, iStrideCur : distance between consecutive rows of the respective buffer
// The residual (org - cur) is transformed with a 4-point butterfly along each row and an
// 8-point butterfly along each column. The absolute coefficients are summed, the [0][0]
// coefficient contributes only a quarter of its magnitude, and the sum is scaled by
// 2 / sqrt( 4 * 8 ).
static Distortion xCalcHADs4x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  const int numCols = 4;
  const int numRows = 8;
  int coef[numRows][numCols];

  // residual
  for( int row = 0; row < numRows; row++ )
  {
    for( int col = 0; col < numCols; col++ )
    {
      coef[row][col] = piOrg[col] - piCur[col];
    }
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  // horizontal: each stage pairs element pos with pos + half, halving the distance per stage
  for( int row = 0; row < numRows; row++ )
  {
    int* line = coef[row];
    for( int half = numCols >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < numCols; base += 2 * half )
      {
        for( int pos = base; pos < base + half; pos++ )
        {
          const int lo = line[pos];
          const int hi = line[pos + half];
          line[pos]        = lo + hi;
          line[pos + half] = lo - hi;
        }
      }
    }
  }

  // vertical: same butterfly applied down every column
  for( int col = 0; col < numCols; col++ )
  {
    for( int half = numRows >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < numRows; base += 2 * half )
      {
        for( int pos = base; pos < base + half; pos++ )
        {
          const int lo = coef[pos][col];
          const int hi = coef[pos + half][col];
          coef[pos][col]        = lo + hi;
          coef[pos + half][col] = lo - hi;
        }
      }
    }
  }

  int sad = 0;
  for( int row = 0; row < numRows; row++ )
  {
    for( int col = 0; col < numCols; col++ )
    {
      sad += abs( coef[row][col] );
    }
  }

  // the [0][0] coefficient is counted with a quarter of its magnitude
  const int absDc = abs( coef[0][0] );
  sad -= absDc;
  sad += absDc >> 2;
  sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );

  return sad;
}
1702
1703
// Hadamard-transformed absolute difference of one 8x4 block (8 columns, 4 rows).
//   piOrg, piCur           : top-left sample of the block in the original / current buffer
//   iStrideOrg, iStrideCur : distance between consecutive rows of the respective buffer
// The residual (org - cur) is transformed with an 8-point butterfly along each row and a
// 4-point butterfly along each column. The absolute coefficients are summed, the [0][0]
// coefficient contributes only a quarter of its magnitude, and the sum is scaled by
// 2 / sqrt( 4 * 8 ).
static Distortion xCalcHADs8x4( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
{
  const int numCols = 8;
  const int numRows = 4;
  int coef[numRows][numCols];

  // residual
  for( int row = 0; row < numRows; row++ )
  {
    for( int col = 0; col < numCols; col++ )
    {
      coef[row][col] = piOrg[col] - piCur[col];
    }
    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  // horizontal: each stage pairs element pos with pos + half, halving the distance per stage
  for( int row = 0; row < numRows; row++ )
  {
    int* line = coef[row];
    for( int half = numCols >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < numCols; base += 2 * half )
      {
        for( int pos = base; pos < base + half; pos++ )
        {
          const int lo = line[pos];
          const int hi = line[pos + half];
          line[pos]        = lo + hi;
          line[pos + half] = lo - hi;
        }
      }
    }
  }

  // vertical: same butterfly applied down every column
  for( int col = 0; col < numCols; col++ )
  {
    for( int half = numRows >> 1; half > 0; half >>= 1 )
    {
      for( int base = 0; base < numRows; base += 2 * half )
      {
        for( int pos = base; pos < base + half; pos++ )
        {
          const int lo = coef[pos][col];
          const int hi = coef[pos + half][col];
          coef[pos][col]        = lo + hi;
          coef[pos + half][col] = lo - hi;
        }
      }
    }
  }

  int sad = 0;
  for( int row = 0; row < numRows; row++ )
  {
    for( int col = 0; col < numCols; col++ )
    {
      sad += abs( coef[row][col] );
    }
  }

  // the [0][0] coefficient is counted with a quarter of its magnitude
  const int absDc = abs( coef[0][0] );
  sad -= absDc;
  sad += absDc >> 2;
  sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );

  return sad;
}
1783
1784
// Returns the smaller of the Hadamard distortion and twice the SAD of the block described
// by rcDtParam. Weighted prediction (applyWeight) is not supported and raises an error.
// The SAD part walks both buffers as contiguous memory: it reads (height >> 2) chunks of
// (width << 2) samples each, in steps of 16 samples, which is why the buffers are checked
// (CHECKD) to be compact, i.e. stride == width for both org and cur.
Distortion RdCost::xGetHAD2SADs( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    THROW(" no support");
  }

  const Distortion hadCost = xGetHADs<false>( rcDtParam );

  CHECKD( (rcDtParam.org.width != rcDtParam.org.stride) || (rcDtParam.cur.stride != rcDtParam.org.stride) , "this functions assumes compact, aligned buffering");

  const Pel* orgChunk  = rcDtParam.org.buf;
  const Pel* curChunk  = rcDtParam.cur.buf;
  const int  numChunks = rcDtParam.org.height>>2;
  const int  chunkLen  = rcDtParam.org.width<<2;

  Distortion absSum = 0;

  for( int chunk = 0; chunk < numChunks; chunk++ )
  {
    // every step consumes a full group of 16 samples
    for( int first = 0; first < chunkLen; first += 16 )
    {
      for( int idx = first; idx < first + 16; idx++ )
      {
        absSum += abs( orgChunk[idx] - curChunk[idx] );
      }
    }
    orgChunk += chunkLen;
    curChunk += chunkLen;
  }

  const Distortion sadCost = (absSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));

  return std::min( hadCost, 2*sadCost );
}
1833
1834
template<bool fastHad>
1835
Distortion RdCost::xGetHADs( const DistParam &rcDtParam )
1836
1.40M
{
1837
1.40M
  if( rcDtParam.applyWeight )
1838
0
  {
1839
0
    THROW(" no support");
1840
0
  }
1841
1.40M
  const Pel* piOrg = rcDtParam.org.buf;
1842
1.40M
  const Pel* piCur = rcDtParam.cur.buf;
1843
1.40M
  const int  iRows = rcDtParam.org.height;
1844
1.40M
  const int  iCols = rcDtParam.org.width;
1845
1.40M
  const int  iStrideCur = rcDtParam.cur.stride;
1846
1.40M
  const int  iStrideOrg = rcDtParam.org.stride;
1847
1848
1.40M
  int  x = 0, y = 0;
1849
1850
1.40M
  Distortion uiSum = 0;
1851
1852
1.40M
  if( iCols > iRows && ( iRows & 7 ) == 0 && ( iCols & 15 ) == 0 )
1853
301k
  {
1854
741k
    for( y = 0; y < iRows; y += 8 )
1855
440k
    {
1856
1.25M
      for( x = 0; x < iCols; x += 16 )
1857
813k
      {
1858
813k
        uiSum += xCalcHADs16x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1859
813k
      }
1860
440k
      piOrg += iStrideOrg * 8;
1861
440k
      piCur += iStrideCur * 8;
1862
440k
    }
1863
301k
  }
1864
1.10M
  else if( iCols < iRows && ( iCols & 7 ) == 0 && ( iRows & 15 ) == 0 )
1865
300k
  {
1866
842k
    for( y = 0; y < iRows; y += 16 )
1867
541k
    {
1868
1.36M
      for( x = 0; x < iCols; x += 8 )
1869
826k
      {
1870
826k
        uiSum += xCalcHADs8x16( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1871
826k
      }
1872
541k
      piOrg += iStrideOrg * 16;
1873
541k
      piCur += iStrideCur * 16;
1874
541k
    }
1875
300k
  }
1876
801k
  else if( iCols > iRows && ( iRows & 3 ) == 0 && ( iCols & 7 ) == 0 )
1877
85.8k
  {
1878
171k
    for( y = 0; y < iRows; y += 4 )
1879
85.8k
    {
1880
244k
      for( x = 0; x < iCols; x += 8 )
1881
158k
      {
1882
158k
        uiSum += xCalcHADs8x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1883
158k
      }
1884
85.8k
      piOrg += iStrideOrg * 4;
1885
85.8k
      piCur += iStrideCur * 4;
1886
85.8k
    }
1887
85.8k
  }
1888
715k
  else if( iCols < iRows && ( iCols & 3 ) == 0 && ( iRows & 7 ) == 0 )
1889
79.4k
  {
1890
228k
    for( y = 0; y < iRows; y += 8 )
1891
149k
    {
1892
299k
      for( x = 0; x < iCols; x += 4 )
1893
149k
      {
1894
149k
        uiSum += xCalcHADs4x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1895
149k
      }
1896
149k
      piOrg += iStrideOrg * 8;
1897
149k
      piCur += iStrideCur * 8;
1898
149k
    }
1899
79.4k
  }
1900
635k
  else if( fastHad && ( ( iRows % 32 == 0 ) && ( iCols % 32 == 0 ) ) && iRows == iCols )
1901
0
  {
1902
0
    for( y = 0; y < iRows; y += 16 )
1903
0
    {
1904
0
      for( x = 0; x < iCols; x += 16 )
1905
0
      {
1906
0
        uiSum += xCalcHADs16x16_fast( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1907
0
      }
1908
0
      piOrg += 16 * iStrideOrg;
1909
0
      piCur += 16 * iStrideCur;
1910
0
    }
1911
0
  }
1912
635k
  else if( ( iRows % 8 == 0 ) && ( iCols % 8 == 0 ) )
1913
567k
  {
1914
2.98M
    for( y = 0; y < iRows; y += 8 )
1915
2.41M
    {
1916
16.5M
      for( x = 0; x < iCols; x += 8 )
1917
14.1M
      {
1918
14.1M
        uiSum += xCalcHADs8x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1919
14.1M
      }
1920
2.41M
      piOrg += 8*iStrideOrg;
1921
2.41M
      piCur += 8*iStrideCur;
1922
2.41M
    }
1923
567k
  }
1924
68.2k
  else if( ( iRows % 4 == 0 ) && ( iCols % 4 == 0 ) )
1925
41.1k
  {
1926
82.3k
    for( y = 0; y < iRows; y += 4 )
1927
41.1k
    {
1928
82.3k
      for( x = 0; x < iCols; x += 4 )
1929
41.1k
      {
1930
41.1k
        uiSum += xCalcHADs4x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1931
41.1k
      }
1932
41.1k
      piOrg += 4*iStrideOrg;
1933
41.1k
      piCur += 4*iStrideCur;
1934
41.1k
    }
1935
41.1k
  }
1936
27.1k
  else if( ( iRows % 2 == 0 ) && ( iCols % 2 == 0 ) )
1937
27.1k
  {
1938
54.2k
    for( y = 0; y < iRows; y += 2 )
1939
27.1k
    {
1940
177k
      for( x = 0; x < iCols; x += 2 )
1941
150k
      {
1942
150k
        uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1943
150k
      }
1944
27.1k
      piOrg += 2*iStrideOrg;
1945
27.1k
      piCur += 2*iStrideCur;
1946
27.1k
    }
1947
27.1k
  }
1948
18.4E
  else
1949
18.4E
  {
1950
18.4E
    THROW( "Invalid size" );
1951
18.4E
  }
1952
1953
1.40M
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1954
1.40M
}
unsigned long vvenc::RdCost::xGetHADs<false>(vvenc::DistParam const&)
Line
Count
Source
1836
1.40M
{
1837
1.40M
  if( rcDtParam.applyWeight )
1838
0
  {
1839
0
    THROW(" no support");
1840
0
  }
1841
1.40M
  const Pel* piOrg = rcDtParam.org.buf;
1842
1.40M
  const Pel* piCur = rcDtParam.cur.buf;
1843
1.40M
  const int  iRows = rcDtParam.org.height;
1844
1.40M
  const int  iCols = rcDtParam.org.width;
1845
1.40M
  const int  iStrideCur = rcDtParam.cur.stride;
1846
1.40M
  const int  iStrideOrg = rcDtParam.org.stride;
1847
1848
1.40M
  int  x = 0, y = 0;
1849
1850
1.40M
  Distortion uiSum = 0;
1851
1852
1.40M
  if( iCols > iRows && ( iRows & 7 ) == 0 && ( iCols & 15 ) == 0 )
1853
301k
  {
1854
741k
    for( y = 0; y < iRows; y += 8 )
1855
440k
    {
1856
1.25M
      for( x = 0; x < iCols; x += 16 )
1857
813k
      {
1858
813k
        uiSum += xCalcHADs16x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1859
813k
      }
1860
440k
      piOrg += iStrideOrg * 8;
1861
440k
      piCur += iStrideCur * 8;
1862
440k
    }
1863
301k
  }
1864
1.10M
  else if( iCols < iRows && ( iCols & 7 ) == 0 && ( iRows & 15 ) == 0 )
1865
300k
  {
1866
842k
    for( y = 0; y < iRows; y += 16 )
1867
541k
    {
1868
1.36M
      for( x = 0; x < iCols; x += 8 )
1869
826k
      {
1870
826k
        uiSum += xCalcHADs8x16( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1871
826k
      }
1872
541k
      piOrg += iStrideOrg * 16;
1873
541k
      piCur += iStrideCur * 16;
1874
541k
    }
1875
300k
  }
1876
801k
  else if( iCols > iRows && ( iRows & 3 ) == 0 && ( iCols & 7 ) == 0 )
1877
85.8k
  {
1878
171k
    for( y = 0; y < iRows; y += 4 )
1879
85.8k
    {
1880
244k
      for( x = 0; x < iCols; x += 8 )
1881
158k
      {
1882
158k
        uiSum += xCalcHADs8x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1883
158k
      }
1884
85.8k
      piOrg += iStrideOrg * 4;
1885
85.8k
      piCur += iStrideCur * 4;
1886
85.8k
    }
1887
85.8k
  }
1888
715k
  else if( iCols < iRows && ( iCols & 3 ) == 0 && ( iRows & 7 ) == 0 )
1889
79.4k
  {
1890
228k
    for( y = 0; y < iRows; y += 8 )
1891
149k
    {
1892
299k
      for( x = 0; x < iCols; x += 4 )
1893
149k
      {
1894
149k
        uiSum += xCalcHADs4x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1895
149k
      }
1896
149k
      piOrg += iStrideOrg * 8;
1897
149k
      piCur += iStrideCur * 8;
1898
149k
    }
1899
79.4k
  }
1900
635k
  else if( fastHad && ( ( iRows % 32 == 0 ) && ( iCols % 32 == 0 ) ) && iRows == iCols )
1901
0
  {
1902
0
    for( y = 0; y < iRows; y += 16 )
1903
0
    {
1904
0
      for( x = 0; x < iCols; x += 16 )
1905
0
      {
1906
0
        uiSum += xCalcHADs16x16_fast( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1907
0
      }
1908
0
      piOrg += 16 * iStrideOrg;
1909
0
      piCur += 16 * iStrideCur;
1910
0
    }
1911
0
  }
1912
635k
  else if( ( iRows % 8 == 0 ) && ( iCols % 8 == 0 ) )
1913
567k
  {
1914
2.98M
    for( y = 0; y < iRows; y += 8 )
1915
2.41M
    {
1916
16.5M
      for( x = 0; x < iCols; x += 8 )
1917
14.1M
      {
1918
14.1M
        uiSum += xCalcHADs8x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1919
14.1M
      }
1920
2.41M
      piOrg += 8*iStrideOrg;
1921
2.41M
      piCur += 8*iStrideCur;
1922
2.41M
    }
1923
567k
  }
1924
68.2k
  else if( ( iRows % 4 == 0 ) && ( iCols % 4 == 0 ) )
1925
41.1k
  {
1926
82.3k
    for( y = 0; y < iRows; y += 4 )
1927
41.1k
    {
1928
82.3k
      for( x = 0; x < iCols; x += 4 )
1929
41.1k
      {
1930
41.1k
        uiSum += xCalcHADs4x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1931
41.1k
      }
1932
41.1k
      piOrg += 4*iStrideOrg;
1933
41.1k
      piCur += 4*iStrideCur;
1934
41.1k
    }
1935
41.1k
  }
1936
27.1k
  else if( ( iRows % 2 == 0 ) && ( iCols % 2 == 0 ) )
1937
27.1k
  {
1938
54.2k
    for( y = 0; y < iRows; y += 2 )
1939
27.1k
    {
1940
177k
      for( x = 0; x < iCols; x += 2 )
1941
150k
      {
1942
150k
        uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1943
150k
      }
1944
27.1k
      piOrg += 2*iStrideOrg;
1945
27.1k
      piCur += 2*iStrideCur;
1946
27.1k
    }
1947
27.1k
  }
1948
18.4E
  else
1949
18.4E
  {
1950
18.4E
    THROW( "Invalid size" );
1951
18.4E
  }
1952
1953
1.40M
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1954
1.40M
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs<true>(vvenc::DistParam const&)
1955
1956
1957
void RdCost::saveUnadjustedLambda()
1958
10.9k
{
1959
10.9k
  m_dLambda_unadjusted = m_dLambda;
1960
10.9k
  m_DistScaleUnadjusted = m_DistScale;
1961
10.9k
}
1962
1963
1964
// Weighted squared error of a single sample pair.
//   org, cur      : the two samples to compare
//   fixedPTweight : multiplier applied to the squared difference
//   uiShift       : right shift applied after adding the constant offset ( 1 << 15 )
inline Distortion getWeightedMSE(const Pel org, const Pel cur, const int64_t fixedPTweight, unsigned uiShift)
{
  const Intermediate_Int delta = org - cur;
  const auto weightedSquare    = fixedPTweight*(delta*delta) + (1 << 15);
  return Intermediate_Int(weightedSquare >> uiShift);
}
1969
1970
template<int csx>
1971
static Distortion lumaWeightedSSE_Core( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights )
1972
0
{
1973
0
        int  iRows = rcDtParam.org.height;
1974
0
  const Pel* piOrg = rcDtParam.org.buf;
1975
0
  const Pel* piCur = rcDtParam.cur.buf;
1976
0
  const int  iCols = rcDtParam.org.width;
1977
0
  const int  iStrideCur = rcDtParam.cur.stride;
1978
0
  const int  iStrideOrg = rcDtParam.org.stride;
1979
0
  const Pel* piOrgLuma        = rcDtParam.orgLuma->buf;
1980
0
  const int  iStrideOrgLuma   = rcDtParam.orgLuma->stride;
1981
1982
0
  Distortion uiSum   = 0;
1983
0
  uint32_t uiShift   = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1);
1984
1985
  // cf, column factor, offset of the second column, to be set to '0' for width of '1'
1986
0
  const int cf =  1 - ( iCols & 1 );
1987
0
  CHECK( ( iCols & 1 ) && iCols != 1, "Width can only be even or equal to '1'!" );
1988
0
  const ComponentID compId = rcDtParam.compID;
1989
0
  const size_t  cShiftY    = getComponentScaleY(compId, chmFmt);
1990
1991
0
  for( ; iRows != 0; iRows-- )
1992
0
  {
1993
0
    for (int n = 0; n < iCols; n+=2 )
1994
0
    {
1995
0
      uiSum += getWeightedMSE( piOrg[n   ], piCur[n   ], lumaWeights[piOrgLuma[(n   )<<csx]], uiShift );
1996
0
      uiSum += getWeightedMSE( piOrg[n+cf], piCur[n+cf], lumaWeights[piOrgLuma[(n+cf)<<csx]], uiShift );
1997
0
    }
1998
1999
0
    piOrg     += iStrideOrg;
2000
0
    piCur     += iStrideCur;
2001
0
    piOrgLuma += iStrideOrgLuma<<cShiftY;
2002
0
  }
2003
2004
0
  return ( uiSum >> ( 1 - cf ) );
2005
0
}
Unexecuted instantiation: RdCost.cpp:unsigned long vvenc::lumaWeightedSSE_Core<0>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
Unexecuted instantiation: RdCost.cpp:unsigned long vvenc::lumaWeightedSSE_Core<1>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
2006
2007
// Weighted SSE where every sample uses the same weight, fixedPTweight.
// Columns are handled in pairs; for a width of 1 the second column offset (cf) is 0, so the
// single sample is accumulated twice and the result is halved on return.
// Widths that are odd and not equal to 1 are rejected by CHECK.
static Distortion fixWeightedSSE_Core( const DistParam& rcDtParam, uint32_t fixedPTweight )
{
  const Pel* orgRow    = rcDtParam.org.buf;
  const Pel* curRow    = rcDtParam.cur.buf;
  const int  iCols     = rcDtParam.org.width;
  const int  curStride = rcDtParam.cur.stride;
  const int  orgStride = rcDtParam.org.stride;

  const uint32_t shift = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1);

  // cf, column factor, offset of the second column, to be set to '0' for width of '1'
  const int cf =  1 - ( iCols & 1 );
  CHECK( ( iCols & 1 ) && iCols != 1, "Width can only be even or equal to '1'!" );

  Distortion acc = 0;
  int rowsLeft   = rcDtParam.org.height;

  while( rowsLeft != 0 )
  {
    for( int first = 0; first < iCols; first += 2 )
    {
      const int second = first + cf;
      acc += getWeightedMSE( orgRow[first ], curRow[first ], fixedPTweight, shift );
      acc += getWeightedMSE( orgRow[second], curRow[second], fixedPTweight, shift );
    }
    orgRow += orgStride;
    curRow += curStride;
    rowsLeft--;
  }

  return ( acc >> ( 1 - cf ) );
}
2036
2037
// Weighted SSE entry point. Weighted prediction (applyWeight) is not supported and raises
// an error. For non-luma components of SDR or HLG signals, a single fixed weight derived
// from m_chromaWeight (scaled by 1 << 16) is passed to m_fxdWtdPredPtr. In every other case
// the call goes to the m_wtdPredPtr entry selected by the component's horizontal scale,
// together with the luma-level weight table.
Distortion RdCost::xGetSSE_WTD( const DistParam &rcDtParam ) const
{
  if( rcDtParam.applyWeight )
  {
    THROW("no support");
  }

  const bool sdrOrHlg       = m_signalType == RESHAPE_SIGNAL_SDR || m_signalType == RESHAPE_SIGNAL_HLG;
  const bool useFixedWeight = sdrOrHlg && rcDtParam.compID != COMP_Y;

  if( !useFixedWeight )
  {
    return m_wtdPredPtr[getComponentScaleX(rcDtParam.compID, m_cf)]( rcDtParam, m_cf, m_reshapeLumaLevelToWeightPLUT );
  }

  const uint32_t fixedPTweight = ( uint32_t ) ( m_chromaWeight * ( double ) ( 1 << 16 ) );

  return m_fxdWtdPredPtr( rcDtParam, fixedPTweight );
}
2057
2058
0
void RdCost::xGetSAD8X5(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
2059
0
  DistParam rcDtParamTmp0 = rcDtParam;
2060
2061
0
  DistParam rcDtParamTmp1 = rcDtParam;
2062
0
  rcDtParamTmp1.org.buf += 1;
2063
0
  rcDtParamTmp1.cur.buf -= 1;
2064
2065
0
  DistParam rcDtParamTmp2 = rcDtParam;
2066
0
  rcDtParamTmp2.org.buf += 2;
2067
0
  rcDtParamTmp2.cur.buf -= 2;
2068
2069
0
  DistParam rcDtParamTmp3 = rcDtParam;
2070
0
  rcDtParamTmp3.org.buf += 3;
2071
0
  rcDtParamTmp3.cur.buf -= 3;
2072
2073
0
  DistParam rcDtParamTmp4 = rcDtParam;
2074
0
  rcDtParamTmp4.org.buf += 4;
2075
0
  rcDtParamTmp4.cur.buf -= 4;
2076
  
2077
0
  cost[0] = (RdCost::xGetSAD8(rcDtParamTmp0)) >> 1;
2078
0
  cost[1] = (RdCost::xGetSAD8(rcDtParamTmp1)) >> 1;
2079
0
  if (isCalCentrePos) cost[2] = (RdCost::xGetSAD8(rcDtParamTmp2)) >> 1;
2080
0
  cost[3] = (RdCost::xGetSAD8(rcDtParamTmp3)) >> 1;
2081
0
  cost[4] = (RdCost::xGetSAD8(rcDtParamTmp4)) >> 1;
2082
0
}
2083
2084
0
void RdCost::xGetSAD16X5(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
2085
0
  DistParam rcDtParamTmp0 = rcDtParam;
2086
2087
0
  DistParam rcDtParamTmp1 = rcDtParam;
2088
0
  rcDtParamTmp1.org.buf += 1;
2089
0
  rcDtParamTmp1.cur.buf -= 1;
2090
2091
0
  DistParam rcDtParamTmp2 = rcDtParam;
2092
0
  rcDtParamTmp2.org.buf += 2;
2093
0
  rcDtParamTmp2.cur.buf -= 2;
2094
2095
0
  DistParam rcDtParamTmp3 = rcDtParam;
2096
0
  rcDtParamTmp3.org.buf += 3;
2097
0
  rcDtParamTmp3.cur.buf -= 3;
2098
2099
0
  DistParam rcDtParamTmp4 = rcDtParam;
2100
0
  rcDtParamTmp4.org.buf += 4;
2101
0
  rcDtParamTmp4.cur.buf -= 4;
2102
  
2103
0
  cost[0] = (RdCost::xGetSAD16(rcDtParamTmp0)) >> 1;
2104
0
  cost[1] = (RdCost::xGetSAD16(rcDtParamTmp1)) >> 1;
2105
0
  if (isCalCentrePos) cost[2] = (RdCost::xGetSAD16(rcDtParamTmp2)) >> 1;
2106
0
  cost[3] = (RdCost::xGetSAD16(rcDtParamTmp3)) >> 1;
2107
0
  cost[4] = (RdCost::xGetSAD16(rcDtParamTmp4)) >> 1;
2108
0
}
2109
2110
// Fills rcDP for a masked SAD evaluation.
//
// rcDP         : distortion parameter set to populate
// org          : original block; its width/height are also used as the size of the current block
// piRefY       : start of the reference samples, stored as the current buffer
// iRefStride   : stride of the reference samples
// mask         : start of the mask samples
// iMaskStride, stepX, iMaskStride2 : mask addressing parameters, stored unchanged
// bitDepth, compID : stored unchanged
void RdCost::setDistParamGeo(DistParam &rcDP, const CPelBuf &org, const Pel *piRefY, int iRefStride, const Pel *mask,
                          int iMaskStride, int stepX, int iMaskStride2, int bitDepth, ComponentID compID)
{
  // original block and the reference ("current") block, which takes its size from 'org'
  rcDP.org        = org;
  rcDP.cur.buf    = piRefY;
  rcDP.cur.stride = iRefStride;
  rcDP.cur.width  = org.width;
  rcDP.cur.height = org.height;

  // mask pointer and its addressing parameters
  rcDP.mask        = mask;
  rcDP.maskStride  = iMaskStride;
  rcDP.maskStride2 = iMaskStride2;
  rcDP.stepX       = stepX;

  // sample format
  rcDP.bitDepth = bitDepth;
  rcDP.compID   = compID;

  // early-exit threshold is set to MAX_DISTORTION; the cost function is the masked SAD entry
  rcDP.maximumDistortionForEarlyExit = MAX_DISTORTION;
  rcDP.distFunc                      = m_afpDistortFunc[0][DF_SAD_WITH_MASK];
}
2135
2136
// Mask-weighted sum of absolute differences between the original and current block.
//
// Only every (1 << subShift)-th row is visited; the sum is scaled back up by subShift and then
// reduced by DISTORTION_PRECISION_ADJUSTMENT( bitDepth ). The mask pointer advances by stepX per
// sample, and by maskStride * (1 << subShift) plus maskStride2 at the end of each visited row.
Distortion RdCost::xGetSADwMask(const DistParam &rcDtParam)
{
  const int subShift = rcDtParam.subShift;
  const int rowStep  = (1 << subShift);
  const int width    = rcDtParam.org.width;

  const int orgAdvance   = rcDtParam.org.stride * rowStep;
  const int curAdvance   = rcDtParam.cur.stride * rowStep;
  const int maskAdvance  = rcDtParam.maskStride * rowStep;
  const int maskAdvance2 = rcDtParam.maskStride2;
  const int maskStepX    = rcDtParam.stepX;

  const uint32_t precisionShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);

  const Pel *orgRow  = rcDtParam.org.buf;
  const Pel *curRow  = rcDtParam.cur.buf;
  const Pel *maskPtr = rcDtParam.mask;

  Distortion acc = 0;
  for (int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep)
  {
    for (int x = 0; x < width; x++, maskPtr += maskStepX)
    {
      acc += abs(orgRow[x] - curRow[x]) * *maskPtr;
    }
    orgRow  += orgAdvance;
    curRow  += curAdvance;
    maskPtr += maskAdvance;
    maskPtr += maskAdvance2;
  }

  // compensate for the skipped rows, then apply the precision adjustment
  acc <<= subShift;
  return (acc >> precisionShift);
}
2168
2169
// Cost of the vector (x, y): the bit count from getBitsMultiplePredsIBC() multiplied by
// m_dCostIBC and converted to Distortion.
Distortion RdCost::getBvCostMultiplePredsIBC(int x, int y, bool useIMV)
{
  const unsigned int numBits = getBitsMultiplePredsIBC(x, y, useIMV);
  return Distortion(m_dCostIBC * numBits);
}
2173
2174
static inline unsigned getIComponentBitsIBC( int val )
2175
3.02M
{
2176
3.02M
  if( !val ) return 1;
2177
2178
18.4E
  const unsigned int l2 = floorLog2( (val <= 0) ? (-val << 1) + 1 : (val << 1) );
2179
2180
1.52M
  return (l2 << 1) + 1;
2181
3.02M
}
2182
2183
unsigned int RdCost::getBitsMultiplePredsIBC(int x, int y, bool useIMV)
2184
1.10M
{
2185
1.10M
  int rmvH[2];
2186
1.10M
  int rmvV[2];
2187
1.10M
  rmvH[0] = x - m_bvPredictors[0].hor;
2188
1.10M
  rmvH[1] = x - m_bvPredictors[1].hor;
2189
2190
1.10M
  rmvV[0] = y - m_bvPredictors[0].ver;
2191
1.10M
  rmvV[1] = y - m_bvPredictors[1].ver;
2192
1.10M
  int absCand[2];
2193
1.10M
  absCand[0] = abs(rmvH[0]) + abs(rmvV[0]);
2194
1.10M
  absCand[1] = abs(rmvH[1]) + abs(rmvV[1]);
2195
2196
1.10M
  if (useIMV && x % 4 == 0 && y % 4 == 0)
2197
404k
  {
2198
404k
    int rmvHQP[2];
2199
404k
    int rmvVQP[2];
2200
2201
404k
    int imvShift = 2;
2202
404k
    int offset = 1 << (imvShift - 1);
2203
2204
404k
    rmvHQP[0] = (x >> 2) - ((m_bvPredictors[0].hor + offset) >> 2);
2205
404k
    rmvHQP[1] = (x >> 2) - ((m_bvPredictors[1].hor + offset) >> 2);
2206
404k
    rmvVQP[0] = (y >> 2) - ((m_bvPredictors[0].ver + offset) >> 2);
2207
404k
    rmvVQP[1] = (y >> 2) - ((m_bvPredictors[1].ver + offset) >> 2);
2208
2209
404k
    int absCandQP[2];
2210
404k
    absCandQP[0] = abs(rmvHQP[0]) + abs(rmvVQP[0]);
2211
404k
    absCandQP[1] = abs(rmvHQP[1]) + abs(rmvVQP[1]);
2212
404k
    unsigned int candBits0QP, candBits1QP;
2213
404k
    if (absCand[0] < absCand[1])
2214
0
    {
2215
0
      unsigned int candBits0 = getIComponentBitsIBC(rmvH[0]) + getIComponentBitsIBC(rmvV[0]);
2216
0
      if (absCandQP[0] < absCandQP[1])
2217
0
      {
2218
0
        candBits0QP = getIComponentBitsIBC(rmvHQP[0]) + getIComponentBitsIBC(rmvVQP[0]);
2219
0
        return candBits0QP < candBits0 ? candBits0QP : candBits0;
2220
0
      }
2221
0
      else
2222
0
      {
2223
0
        candBits1QP = getIComponentBitsIBC(rmvHQP[1]) + getIComponentBitsIBC(rmvVQP[1]);
2224
0
        return candBits1QP < candBits0 ? candBits1QP : candBits0;
2225
0
      }
2226
0
    }
2227
404k
    else
2228
404k
    {
2229
404k
      unsigned int candBits1 = getIComponentBitsIBC(rmvH[1]) + getIComponentBitsIBC(rmvV[1]);
2230
404k
      if (absCandQP[0] < absCandQP[1])
2231
0
      {
2232
0
        candBits0QP = getIComponentBitsIBC(rmvHQP[0]) + getIComponentBitsIBC(rmvVQP[0]);
2233
0
        return candBits0QP < candBits1 ? candBits0QP : candBits1;
2234
0
      }
2235
404k
      else
2236
404k
      {
2237
404k
        candBits1QP = getIComponentBitsIBC(rmvHQP[1]) + getIComponentBitsIBC(rmvVQP[1]);
2238
18.4E
        return candBits1QP < candBits1 ? candBits1QP : candBits1;
2239
404k
      }
2240
404k
    }
2241
404k
  }
2242
700k
  else
2243
700k
  {
2244
700k
    if (absCand[0] < absCand[1])
2245
0
    {
2246
0
      return getIComponentBitsIBC(rmvH[0]) + getIComponentBitsIBC(rmvV[0]);
2247
0
    }
2248
700k
    else
2249
700k
    {
2250
700k
      return getIComponentBitsIBC(rmvH[1]) + getIComponentBitsIBC(rmvV[1]);
2251
700k
    }
2252
700k
  }
2253
1.10M
}
2254
2255
} // namespace vvenc
2256
2257
//! \}
2258