Coverage Report

Created: 2026-05-30 06:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/vvenc/source/Lib/CommonLib/RdCost.cpp
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
44
/** \file     RdCost.cpp
45
    \brief    RD cost computation class
46
*/
47
48
#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP
49
50
#include "RdCost.h"
51
#include "Rom.h"
52
#include "UnitPartitioner.h"
53
#include "SearchSpaceCounter.h"
54
55
56
//! \ingroup CommonLib
57
//! \{
58
59
namespace vvenc {
60
61
62
template<int csx>
63
static Distortion lumaWeightedSSE_Core( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights );
64
65
static Distortion fixWeightedSSE_Core( const DistParam& rcDtParam, uint32_t fixedWeight );
66
67
RdCost::RdCost()
68
20.7k
  : m_afpDistortFunc{ { nullptr, }, { nullptr, } }
69
20.7k
{
70
20.7k
}
71
72
RdCost::~RdCost()
73
20.7k
{
74
20.7k
}
75
76
void RdCost::setLambda( double dLambda, const BitDepths &bitDepths )
77
18.1k
{
78
18.1k
  m_dLambda          = dLambda;
79
18.1k
  m_DistScale        = double(1<<SCALE_BITS) / m_dLambda;
80
18.1k
  m_dLambdaMotionSAD = sqrt(m_dLambda);
81
18.1k
}
82
83
84
// Initialize Function Pointer by [eDFunc]
85
void RdCost::create( bool enableOpt )
86
20.7k
{
87
20.7k
  m_signalType                 = RESHAPE_SIGNAL_NULL;
88
20.7k
  m_chromaWeight               = 1.0;
89
20.7k
  m_lumaBD                     = 10;
90
20.7k
  m_afpDistortFunc[0][DF_SSE    ] = RdCost::xGetSSE;
91
20.7k
  m_afpDistortFunc[0][DF_SSE2   ] = RdCost::xGetSSE;
92
20.7k
  m_afpDistortFunc[0][DF_SSE4   ] = RdCost::xGetSSE4;
93
20.7k
  m_afpDistortFunc[0][DF_SSE8   ] = RdCost::xGetSSE8;
94
20.7k
  m_afpDistortFunc[0][DF_SSE16  ] = RdCost::xGetSSE16;
95
20.7k
  m_afpDistortFunc[0][DF_SSE32  ] = RdCost::xGetSSE32;
96
20.7k
  m_afpDistortFunc[0][DF_SSE64  ] = RdCost::xGetSSE64;
97
20.7k
  m_afpDistortFunc[0][DF_SSE128 ] = RdCost::xGetSSE128;
98
99
20.7k
  m_afpDistortFunc[0][DF_SAD    ] = RdCost::xGetSAD;
100
20.7k
  m_afpDistortFunc[0][DF_SAD2   ] = RdCost::xGetSAD;
101
20.7k
  m_afpDistortFunc[0][DF_SAD4   ] = RdCost::xGetSAD4;
102
20.7k
  m_afpDistortFunc[0][DF_SAD8   ] = RdCost::xGetSAD8;
103
20.7k
  m_afpDistortFunc[0][DF_SAD16  ] = RdCost::xGetSAD16;
104
20.7k
  m_afpDistortFunc[0][DF_SAD32  ] = RdCost::xGetSAD32;
105
20.7k
  m_afpDistortFunc[0][DF_SAD64  ] = RdCost::xGetSAD64;
106
20.7k
  m_afpDistortFunc[0][DF_SAD128 ] = RdCost::xGetSAD128;
107
108
20.7k
  m_afpDistortFunc[0][DF_HAD    ] = RdCost::xGetHADs<false>;
109
20.7k
  m_afpDistortFunc[0][DF_HAD2   ] = RdCost::xGetHADs<false>;
110
20.7k
  m_afpDistortFunc[0][DF_HAD4   ] = RdCost::xGetHADs<false>;
111
20.7k
  m_afpDistortFunc[0][DF_HAD8   ] = RdCost::xGetHADs<false>;
112
20.7k
  m_afpDistortFunc[0][DF_HAD16  ] = RdCost::xGetHADs<false>;
113
20.7k
  m_afpDistortFunc[0][DF_HAD32  ] = RdCost::xGetHADs<false>;
114
20.7k
  m_afpDistortFunc[0][DF_HAD64  ] = RdCost::xGetHADs<false>;
115
20.7k
  m_afpDistortFunc[0][DF_HAD128 ] = RdCost::xGetHADs<false>;
116
117
20.7k
  m_afpDistortFunc[0][DF_HAD_fast    ] = RdCost::xGetHADs<true>;
118
20.7k
  m_afpDistortFunc[0][DF_HAD2_fast   ] = RdCost::xGetHADs<true>;
119
20.7k
  m_afpDistortFunc[0][DF_HAD4_fast   ] = RdCost::xGetHADs<true>;
120
20.7k
  m_afpDistortFunc[0][DF_HAD8_fast   ] = RdCost::xGetHADs<true>;
121
20.7k
  m_afpDistortFunc[0][DF_HAD16_fast  ] = RdCost::xGetHADs<true>;
122
20.7k
  m_afpDistortFunc[0][DF_HAD32_fast  ] = RdCost::xGetHADs<true>;
123
20.7k
  m_afpDistortFunc[0][DF_HAD64_fast  ] = RdCost::xGetHADs<true>;
124
20.7k
  m_afpDistortFunc[0][DF_HAD128_fast ] = RdCost::xGetHADs<true>;
125
126
  //  m_afpDistortFunc[0][DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD;
127
20.7k
  m_afpDistortFunc[0][DF_HAD_2SAD ] = RdCost::xGetHAD2SADs;
128
129
20.7k
  m_afpDistortFunc[0][DF_SAD_WITH_MASK] = RdCost::xGetSADwMask;
130
  // m_afpDistortFunc[1] can be used in any case
131
20.7k
  memcpy( m_afpDistortFunc[1], m_afpDistortFunc[0], sizeof(m_afpDistortFunc)/2);
132
133
20.7k
  m_wtdPredPtr[0] = lumaWeightedSSE_Core<0>;
134
20.7k
  m_wtdPredPtr[1] = lumaWeightedSSE_Core<1>;
135
20.7k
  m_fxdWtdPredPtr = fixWeightedSSE_Core;
136
137
20.7k
  m_afpDistortFuncX5[0] = RdCost::xGetSAD8X5;
138
20.7k
  m_afpDistortFuncX5[1] = RdCost::xGetSAD16X5;
139
140
20.7k
#if ENABLE_SIMD_OPT_DIST
141
20.7k
  if( enableOpt )
142
20.7k
  {
143
#ifdef TARGET_SIMD_X86
144
    initRdCostX86();
145
#endif
146
#ifdef TARGET_SIMD_ARM
147
    initRdCostARM();
148
#endif
149
20.7k
  }
150
20.7k
#endif
151
152
20.7k
  m_costMode      = VVENC_COST_STANDARD_LOSSY;
153
20.7k
  m_motionLambda  = 0;
154
20.7k
  m_iCostScale    = 0;
155
20.7k
}
156
157
#if ENABLE_MEASURE_SEARCH_SPACE
158
static Distortion xMeasurePredSearchSpaceInterceptor( const DistParam& dp )
159
{
160
  g_searchSpaceAcc.addPrediction( dp.cur.width, dp.cur.height, toChannelType( dp.compID ) );
161
  return dp.xDistFunc( dp );
162
}
163
164
#endif
165
void RdCost::setDistParam( DistParam &rcDP, const CPelBuf& org, const Pel* piRefY, int iRefStride, int bitDepth, ComponentID compID, int subShiftMode, int useHadamard )
166
54.3k
{
167
54.3k
  rcDP.bitDepth   = bitDepth;
168
54.3k
  rcDP.compID     = compID;
169
170
  // set Original & Curr Pointer / Stride
171
54.3k
  rcDP.org        = org;
172
173
54.3k
  rcDP.cur.buf    = piRefY;
174
54.3k
  rcDP.cur.stride = iRefStride;
175
176
  // set Block Width / Height
177
54.3k
  rcDP.cur.width    = org.width;
178
54.3k
  rcDP.cur.height   = org.height;
179
54.3k
  rcDP.maximumDistortionForEarlyExit = MAX_DISTORTION;
180
181
54.3k
  const int base = (rcDP.bitDepth > 10 || rcDP.applyWeight) ? 1 : 0;
182
54.3k
  if( !useHadamard )
183
54.3k
  {
184
54.3k
    rcDP.distFunc = m_afpDistortFunc[base][ DF_SAD + Log2( org.width ) ];
185
54.3k
  }
186
0
  else
187
0
  {
188
0
    rcDP.distFunc = m_afpDistortFunc[base][( useHadamard == 1 ? DF_HAD : DF_HAD_fast ) + Log2( org.width ) ];
189
0
  }
190
191
  // initialize
192
54.3k
  rcDP.subShift  = 0;
193
194
54.3k
  if( subShiftMode == 1 )
195
0
  {
196
0
    if( rcDP.org.height > 8 && rcDP.org.width <= 128 )
197
0
    {
198
0
      rcDP.subShift = 1;
199
0
    }
200
0
  }
201
54.3k
  else if( subShiftMode == 2 )
202
0
  {
203
0
    if (rcDP.org.height > 8)
204
0
    {
205
0
      rcDP.subShift = 1;
206
0
    }
207
0
  }
208
209
#if ENABLE_MEASURE_SEARCH_SPACE
210
  rcDP.xDistFunc = rcDP.distFunc;
211
  rcDP.distFunc  = xMeasurePredSearchSpaceInterceptor;
212
#endif
213
54.3k
}
214
215
216
DistParam RdCost::setDistParam( const CPelBuf& org, const CPelBuf& cur, int bitDepth, DFunc dfunc )
217
298k
{
218
298k
  int index = dfunc;
219
298k
  if( dfunc != DF_HAD && dfunc != DF_HAD_fast && dfunc != DF_HAD_2SAD )
220
121k
  {
221
121k
    index += Log2(org.width);
222
121k
  }
223
224
298k
  const int base = bitDepth > 10 ? 1:0; //TBD: check does SDA ever overflow
225
#if ENABLE_MEASURE_SEARCH_SPACE
226
  DistParam rcDP( org, cur, m_afpDistortFunc[base][index], bitDepth, 0, COMP_Y );
227
  rcDP.xDistFunc = rcDP.distFunc;
228
  rcDP.distFunc  = xMeasurePredSearchSpaceInterceptor;
229
  return rcDP;
230
#else
231
298k
  return DistParam( org, cur, m_afpDistortFunc[base][index], bitDepth, 0, COMP_Y );
232
298k
#endif
233
298k
}
234
235
DistParam RdCost::setDistParam( const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShift, bool isDMVR )
236
0
{
237
0
  DistParam rcDP;
238
0
  rcDP.bitDepth   = bitDepth;
239
0
  rcDP.compID     = compID;
240
241
0
  rcDP.org.buf    = pOrg;
242
0
  rcDP.org.stride = iOrgStride;
243
0
  rcDP.org.width  = width;
244
0
  rcDP.org.height = height;
245
246
0
  rcDP.cur.buf    = piRefY;
247
0
  rcDP.cur.stride = iRefStride;
248
0
  rcDP.cur.width  = width;
249
0
  rcDP.cur.height = height;
250
0
  rcDP.subShift   = subShift;
251
252
  //  CHECK( useHadamard || rcDP.useMR, "only used in xDMVRCost with these default parameters (so far...)" );
253
0
  const int base = (rcDP.bitDepth > 10) ? 1 : 0;
254
255
0
  rcDP.distFunc = m_afpDistortFunc[base][ DF_SAD + Log2( width ) ];
256
  
257
0
  if( isDMVR )
258
0
  {
259
0
    rcDP.dmvrSadX5 = m_afpDistortFuncX5[Log2( width ) - 3];
260
0
  }
261
262
#if ENABLE_MEASURE_SEARCH_SPACE
263
  if( !isDMVR )
264
  {
265
    // DMVT is part of the decoder complexity
266
    rcDP.xDistFunc = rcDP.distFunc;
267
    rcDP.distFunc = xMeasurePredSearchSpaceInterceptor;
268
  }
269
270
#endif
271
0
  return rcDP;
272
0
}
273
274
Distortion RdCost::getDistPart( const CPelBuf& org, const CPelBuf& cur, int bitDepth, const ComponentID compId, DFunc eDFunc, const CPelBuf* orgLuma )
275
2.92M
{
276
2.92M
  DistParam dp( org, cur, nullptr, bitDepth, 0, compId );
277
# if ENABLE_MEASURE_SEARCH_SPACE
278
  g_searchSpaceAcc.addPrediction( dp.cur.width, dp.cur.height, toChannelType( dp.compID ) );
279
#endif
280
2.92M
  Distortion dist;
281
2.92M
  if( orgLuma )
282
0
  {
283
0
    CHECKD( eDFunc != DF_SSE_WTD, "mismatch func and parameter")
284
0
    dp.orgLuma  = orgLuma;
285
0
    dist = RdCost::xGetSSE_WTD( dp );
286
0
  }
287
2.92M
  else
288
2.92M
  {
289
2.92M
    if( ( org.width == 1 ) )
290
0
    {
291
0
      dist = xGetSSE( dp );
292
0
    }
293
2.92M
    else
294
2.92M
    {
295
2.92M
      const int base = (bitDepth > 10) ? 1 : 0;
296
2.92M
      dist = m_afpDistortFunc[base][eDFunc + Log2(org.width)](dp);
297
2.92M
    }
298
2.92M
  }
299
2.92M
  if (isChroma(compId))
300
2.44M
  {
301
2.44M
    return ((Distortion) (m_distortionWeight[ compId ] * dist));
302
2.44M
  }
303
473k
  else
304
473k
  {
305
473k
    return dist;
306
473k
  }
307
2.92M
}
308
309
// ====================================================================================================================
310
// Distortion functions
311
// ====================================================================================================================
312
313
// --------------------------------------------------------------------------------------------------------------------
314
// SAD
315
// --------------------------------------------------------------------------------------------------------------------
316
317
Distortion RdCost::xGetSAD( const DistParam& rcDtParam )
318
0
{
319
0
  if ( rcDtParam.applyWeight )
320
0
  {
321
0
    THROW(" no support");
322
0
  }
323
324
0
  const Pel* piOrg           = rcDtParam.org.buf;
325
0
  const Pel* piCur           = rcDtParam.cur.buf;
326
0
  const int  iCols           = rcDtParam.org.width;
327
0
        int  iRows           = rcDtParam.org.height;
328
0
  const int  iSubShift       = rcDtParam.subShift;
329
0
  const int  iSubStep        = ( 1 << iSubShift );
330
0
  const int  iStrideCur      = rcDtParam.cur.stride * iSubStep;
331
0
  const int  iStrideOrg      = rcDtParam.org.stride * iSubStep;
332
0
  const uint32_t distortionShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
333
334
0
  Distortion uiSum = 0;
335
336
0
  for( ; iRows != 0; iRows -= iSubStep )
337
0
  {
338
0
    for (int n = 0; n < iCols; n++ )
339
0
    {
340
0
      uiSum += abs( piOrg[n] - piCur[n] );
341
0
    }
342
0
    if (rcDtParam.maximumDistortionForEarlyExit < ( uiSum >> distortionShift ))
343
0
    {
344
0
      return ( uiSum >> distortionShift );
345
0
    }
346
0
    piOrg += iStrideOrg;
347
0
    piCur += iStrideCur;
348
0
  }
349
350
0
  uiSum <<= iSubShift;
351
0
  return ( uiSum >> distortionShift );
352
0
}
353
354
Distortion RdCost::xGetSAD4( const DistParam& rcDtParam )
355
138k
{
356
138k
  if ( rcDtParam.applyWeight )
357
0
  {
358
0
    THROW(" no support");
359
0
  }
360
361
138k
  const Pel* piOrg   = rcDtParam.org.buf;
362
138k
  const Pel* piCur   = rcDtParam.cur.buf;
363
138k
  int  iRows         = rcDtParam.org.height;
364
138k
  int  iSubShift     = rcDtParam.subShift;
365
138k
  int  iSubStep      = ( 1 << iSubShift );
366
138k
  int  iStrideCur    = rcDtParam.cur.stride * iSubStep;
367
138k
  int  iStrideOrg    = rcDtParam.org.stride * iSubStep;
368
369
138k
  Distortion uiSum = 0;
370
371
1.72M
  for( ; iRows != 0; iRows -= iSubStep )
372
1.58M
  {
373
1.58M
    uiSum += abs( piOrg[0] - piCur[0] );
374
1.58M
    uiSum += abs( piOrg[1] - piCur[1] );
375
1.58M
    uiSum += abs( piOrg[2] - piCur[2] );
376
1.58M
    uiSum += abs( piOrg[3] - piCur[3] );
377
378
1.58M
    piOrg += iStrideOrg;
379
1.58M
    piCur += iStrideCur;
380
1.58M
  }
381
382
138k
  uiSum <<= iSubShift;
383
138k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
384
138k
}
385
386
Distortion RdCost::xGetSAD8( const DistParam& rcDtParam )
387
580k
{
388
580k
  if ( rcDtParam.applyWeight )
389
0
  {
390
0
    THROW(" no support");
391
0
  }
392
393
580k
  const Pel* piOrg      = rcDtParam.org.buf;
394
580k
  const Pel* piCur      = rcDtParam.cur.buf;
395
580k
  int  iRows            = rcDtParam.org.height;
396
580k
  int  iSubShift        = rcDtParam.subShift;
397
580k
  int  iSubStep         = ( 1 << iSubShift );
398
580k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
399
580k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
400
401
580k
  Distortion uiSum = 0;
402
403
12.4M
  for( ; iRows != 0; iRows-=iSubStep )
404
11.8M
  {
405
11.8M
    uiSum += abs( piOrg[0] - piCur[0] );
406
11.8M
    uiSum += abs( piOrg[1] - piCur[1] );
407
11.8M
    uiSum += abs( piOrg[2] - piCur[2] );
408
11.8M
    uiSum += abs( piOrg[3] - piCur[3] );
409
11.8M
    uiSum += abs( piOrg[4] - piCur[4] );
410
11.8M
    uiSum += abs( piOrg[5] - piCur[5] );
411
11.8M
    uiSum += abs( piOrg[6] - piCur[6] );
412
11.8M
    uiSum += abs( piOrg[7] - piCur[7] );
413
414
11.8M
    piOrg += iStrideOrg;
415
11.8M
    piCur += iStrideCur;
416
11.8M
  }
417
418
580k
  uiSum <<= iSubShift;
419
580k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
420
580k
}
421
422
Distortion RdCost::xGetSAD16( const DistParam& rcDtParam )
423
495k
{
424
495k
  if ( rcDtParam.applyWeight )
425
0
  {
426
0
    THROW(" no support");
427
0
  }
428
429
495k
  const Pel* piOrg      = rcDtParam.org.buf;
430
495k
  const Pel* piCur      = rcDtParam.cur.buf;
431
495k
  int  iRows            = rcDtParam.org.height;
432
495k
  int  iSubShift        = rcDtParam.subShift;
433
495k
  int  iSubStep         = ( 1 << iSubShift );
434
495k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
435
495k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
436
437
495k
  Distortion uiSum = 0;
438
439
11.0M
  for( ; iRows != 0; iRows -= iSubStep )
440
10.5M
  {
441
10.5M
    uiSum += abs( piOrg[0] - piCur[0] );
442
10.5M
    uiSum += abs( piOrg[1] - piCur[1] );
443
10.5M
    uiSum += abs( piOrg[2] - piCur[2] );
444
10.5M
    uiSum += abs( piOrg[3] - piCur[3] );
445
10.5M
    uiSum += abs( piOrg[4] - piCur[4] );
446
10.5M
    uiSum += abs( piOrg[5] - piCur[5] );
447
10.5M
    uiSum += abs( piOrg[6] - piCur[6] );
448
10.5M
    uiSum += abs( piOrg[7] - piCur[7] );
449
10.5M
    uiSum += abs( piOrg[8] - piCur[8] );
450
10.5M
    uiSum += abs( piOrg[9] - piCur[9] );
451
10.5M
    uiSum += abs( piOrg[10] - piCur[10] );
452
10.5M
    uiSum += abs( piOrg[11] - piCur[11] );
453
10.5M
    uiSum += abs( piOrg[12] - piCur[12] );
454
10.5M
    uiSum += abs( piOrg[13] - piCur[13] );
455
10.5M
    uiSum += abs( piOrg[14] - piCur[14] );
456
10.5M
    uiSum += abs( piOrg[15] - piCur[15] );
457
458
10.5M
    piOrg += iStrideOrg;
459
10.5M
    piCur += iStrideCur;
460
10.5M
  }
461
462
495k
  uiSum <<= iSubShift;
463
495k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
464
495k
}
465
466
467
Distortion RdCost::xGetSAD128( const DistParam &rcDtParam )
468
0
{
469
0
  const Pel* piOrg  = rcDtParam.org.buf;
470
0
  const Pel* piCur  = rcDtParam.cur.buf;
471
0
  int  iRows        = rcDtParam.org.height;
472
0
  int  iCols        = rcDtParam.org.width;
473
0
  int  iSubShift    = rcDtParam.subShift;
474
0
  int  iSubStep     = ( 1 << iSubShift );
475
0
  int  iStrideCur   = rcDtParam.cur.stride * iSubStep;
476
0
  int  iStrideOrg   = rcDtParam.org.stride * iSubStep;
477
478
0
  Distortion uiSum = 0;
479
480
0
  for( ; iRows != 0; iRows-=iSubStep )
481
0
  {
482
0
    for (int n = 0; n < iCols; n+=16 )
483
0
    {
484
0
      uiSum += abs( piOrg[n+ 0] - piCur[n+ 0] );
485
0
      uiSum += abs( piOrg[n+ 1] - piCur[n+ 1] );
486
0
      uiSum += abs( piOrg[n+ 2] - piCur[n+ 2] );
487
0
      uiSum += abs( piOrg[n+ 3] - piCur[n+ 3] );
488
0
      uiSum += abs( piOrg[n+ 4] - piCur[n+ 4] );
489
0
      uiSum += abs( piOrg[n+ 5] - piCur[n+ 5] );
490
0
      uiSum += abs( piOrg[n+ 6] - piCur[n+ 6] );
491
0
      uiSum += abs( piOrg[n+ 7] - piCur[n+ 7] );
492
0
      uiSum += abs( piOrg[n+ 8] - piCur[n+ 8] );
493
0
      uiSum += abs( piOrg[n+ 9] - piCur[n+ 9] );
494
0
      uiSum += abs( piOrg[n+10] - piCur[n+10] );
495
0
      uiSum += abs( piOrg[n+11] - piCur[n+11] );
496
0
      uiSum += abs( piOrg[n+12] - piCur[n+12] );
497
0
      uiSum += abs( piOrg[n+13] - piCur[n+13] );
498
0
      uiSum += abs( piOrg[n+14] - piCur[n+14] );
499
0
      uiSum += abs( piOrg[n+15] - piCur[n+15] );
500
0
    }
501
0
    piOrg += iStrideOrg;
502
0
    piCur += iStrideCur;
503
0
  }
504
505
0
  uiSum <<= iSubShift;
506
0
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
507
0
}
508
509
Distortion RdCost::xGetSAD32( const DistParam &rcDtParam )
510
750k
{
511
750k
  if ( rcDtParam.applyWeight )
512
0
  {
513
0
    THROW(" no support");
514
0
  }
515
516
750k
  const Pel* piOrg      = rcDtParam.org.buf;
517
750k
  const Pel* piCur      = rcDtParam.cur.buf;
518
750k
  int  iRows            = rcDtParam.org.height;
519
750k
  int  iSubShift        = rcDtParam.subShift;
520
750k
  int  iSubStep         = ( 1 << iSubShift );
521
750k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
522
750k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
523
524
750k
  Distortion uiSum = 0;
525
526
14.8M
  for( ; iRows != 0; iRows-=iSubStep )
527
14.1M
  {
528
14.1M
    uiSum += abs( piOrg[0] - piCur[0] );
529
14.1M
    uiSum += abs( piOrg[1] - piCur[1] );
530
14.1M
    uiSum += abs( piOrg[2] - piCur[2] );
531
14.1M
    uiSum += abs( piOrg[3] - piCur[3] );
532
14.1M
    uiSum += abs( piOrg[4] - piCur[4] );
533
14.1M
    uiSum += abs( piOrg[5] - piCur[5] );
534
14.1M
    uiSum += abs( piOrg[6] - piCur[6] );
535
14.1M
    uiSum += abs( piOrg[7] - piCur[7] );
536
14.1M
    uiSum += abs( piOrg[8] - piCur[8] );
537
14.1M
    uiSum += abs( piOrg[9] - piCur[9] );
538
14.1M
    uiSum += abs( piOrg[10] - piCur[10] );
539
14.1M
    uiSum += abs( piOrg[11] - piCur[11] );
540
14.1M
    uiSum += abs( piOrg[12] - piCur[12] );
541
14.1M
    uiSum += abs( piOrg[13] - piCur[13] );
542
14.1M
    uiSum += abs( piOrg[14] - piCur[14] );
543
14.1M
    uiSum += abs( piOrg[15] - piCur[15] );
544
14.1M
    uiSum += abs( piOrg[16] - piCur[16] );
545
14.1M
    uiSum += abs( piOrg[17] - piCur[17] );
546
14.1M
    uiSum += abs( piOrg[18] - piCur[18] );
547
14.1M
    uiSum += abs( piOrg[19] - piCur[19] );
548
14.1M
    uiSum += abs( piOrg[20] - piCur[20] );
549
14.1M
    uiSum += abs( piOrg[21] - piCur[21] );
550
14.1M
    uiSum += abs( piOrg[22] - piCur[22] );
551
14.1M
    uiSum += abs( piOrg[23] - piCur[23] );
552
14.1M
    uiSum += abs( piOrg[24] - piCur[24] );
553
14.1M
    uiSum += abs( piOrg[25] - piCur[25] );
554
14.1M
    uiSum += abs( piOrg[26] - piCur[26] );
555
14.1M
    uiSum += abs( piOrg[27] - piCur[27] );
556
14.1M
    uiSum += abs( piOrg[28] - piCur[28] );
557
14.1M
    uiSum += abs( piOrg[29] - piCur[29] );
558
14.1M
    uiSum += abs( piOrg[30] - piCur[30] );
559
14.1M
    uiSum += abs( piOrg[31] - piCur[31] );
560
561
14.1M
    piOrg += iStrideOrg;
562
14.1M
    piCur += iStrideCur;
563
14.1M
  }
564
565
750k
  uiSum <<= iSubShift;
566
750k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
567
750k
}
568
569
570
Distortion RdCost::xGetSAD64( const DistParam &rcDtParam )
571
9.32k
{
572
9.32k
  if ( rcDtParam.applyWeight )
573
0
  {
574
0
    THROW(" no support");
575
0
  }
576
577
9.32k
  const Pel* piOrg      = rcDtParam.org.buf;
578
9.32k
  const Pel* piCur      = rcDtParam.cur.buf;
579
9.32k
  int  iRows            = rcDtParam.org.height;
580
9.32k
  int  iSubShift        = rcDtParam.subShift;
581
9.32k
  int  iSubStep         = ( 1 << iSubShift );
582
9.32k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
583
9.32k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
584
585
9.32k
  Distortion uiSum = 0;
586
587
605k
  for( ; iRows != 0; iRows-=iSubStep )
588
596k
  {
589
596k
    uiSum += abs( piOrg[0] - piCur[0] );
590
596k
    uiSum += abs( piOrg[1] - piCur[1] );
591
596k
    uiSum += abs( piOrg[2] - piCur[2] );
592
596k
    uiSum += abs( piOrg[3] - piCur[3] );
593
596k
    uiSum += abs( piOrg[4] - piCur[4] );
594
596k
    uiSum += abs( piOrg[5] - piCur[5] );
595
596k
    uiSum += abs( piOrg[6] - piCur[6] );
596
596k
    uiSum += abs( piOrg[7] - piCur[7] );
597
596k
    uiSum += abs( piOrg[8] - piCur[8] );
598
596k
    uiSum += abs( piOrg[9] - piCur[9] );
599
596k
    uiSum += abs( piOrg[10] - piCur[10] );
600
596k
    uiSum += abs( piOrg[11] - piCur[11] );
601
596k
    uiSum += abs( piOrg[12] - piCur[12] );
602
596k
    uiSum += abs( piOrg[13] - piCur[13] );
603
596k
    uiSum += abs( piOrg[14] - piCur[14] );
604
596k
    uiSum += abs( piOrg[15] - piCur[15] );
605
596k
    uiSum += abs( piOrg[16] - piCur[16] );
606
596k
    uiSum += abs( piOrg[17] - piCur[17] );
607
596k
    uiSum += abs( piOrg[18] - piCur[18] );
608
596k
    uiSum += abs( piOrg[19] - piCur[19] );
609
596k
    uiSum += abs( piOrg[20] - piCur[20] );
610
596k
    uiSum += abs( piOrg[21] - piCur[21] );
611
596k
    uiSum += abs( piOrg[22] - piCur[22] );
612
596k
    uiSum += abs( piOrg[23] - piCur[23] );
613
596k
    uiSum += abs( piOrg[24] - piCur[24] );
614
596k
    uiSum += abs( piOrg[25] - piCur[25] );
615
596k
    uiSum += abs( piOrg[26] - piCur[26] );
616
596k
    uiSum += abs( piOrg[27] - piCur[27] );
617
596k
    uiSum += abs( piOrg[28] - piCur[28] );
618
596k
    uiSum += abs( piOrg[29] - piCur[29] );
619
596k
    uiSum += abs( piOrg[30] - piCur[30] );
620
596k
    uiSum += abs( piOrg[31] - piCur[31] );
621
596k
    uiSum += abs( piOrg[32] - piCur[32] );
622
596k
    uiSum += abs( piOrg[33] - piCur[33] );
623
596k
    uiSum += abs( piOrg[34] - piCur[34] );
624
596k
    uiSum += abs( piOrg[35] - piCur[35] );
625
596k
    uiSum += abs( piOrg[36] - piCur[36] );
626
596k
    uiSum += abs( piOrg[37] - piCur[37] );
627
596k
    uiSum += abs( piOrg[38] - piCur[38] );
628
596k
    uiSum += abs( piOrg[39] - piCur[39] );
629
596k
    uiSum += abs( piOrg[40] - piCur[40] );
630
596k
    uiSum += abs( piOrg[41] - piCur[41] );
631
596k
    uiSum += abs( piOrg[42] - piCur[42] );
632
596k
    uiSum += abs( piOrg[43] - piCur[43] );
633
596k
    uiSum += abs( piOrg[44] - piCur[44] );
634
596k
    uiSum += abs( piOrg[45] - piCur[45] );
635
596k
    uiSum += abs( piOrg[46] - piCur[46] );
636
596k
    uiSum += abs( piOrg[47] - piCur[47] );
637
596k
    uiSum += abs( piOrg[48] - piCur[48] );
638
596k
    uiSum += abs( piOrg[49] - piCur[49] );
639
596k
    uiSum += abs( piOrg[50] - piCur[50] );
640
596k
    uiSum += abs( piOrg[51] - piCur[51] );
641
596k
    uiSum += abs( piOrg[52] - piCur[52] );
642
596k
    uiSum += abs( piOrg[53] - piCur[53] );
643
596k
    uiSum += abs( piOrg[54] - piCur[54] );
644
596k
    uiSum += abs( piOrg[55] - piCur[55] );
645
596k
    uiSum += abs( piOrg[56] - piCur[56] );
646
596k
    uiSum += abs( piOrg[57] - piCur[57] );
647
596k
    uiSum += abs( piOrg[58] - piCur[58] );
648
596k
    uiSum += abs( piOrg[59] - piCur[59] );
649
596k
    uiSum += abs( piOrg[60] - piCur[60] );
650
596k
    uiSum += abs( piOrg[61] - piCur[61] );
651
596k
    uiSum += abs( piOrg[62] - piCur[62] );
652
596k
    uiSum += abs( piOrg[63] - piCur[63] );
653
654
596k
    piOrg += iStrideOrg;
655
596k
    piCur += iStrideCur;
656
596k
  }
657
658
9.32k
  uiSum <<= iSubShift;
659
9.32k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
660
9.32k
}
661
662
663
// --------------------------------------------------------------------------------------------------------------------
664
// SSE
665
// --------------------------------------------------------------------------------------------------------------------
666
667
Distortion RdCost::xGetSSE( const DistParam &rcDtParam )
668
0
{
669
0
  if ( rcDtParam.applyWeight )
670
0
  {
671
0
    THROW(" no support");
672
0
  }
673
674
0
  const Pel* piOrg      = rcDtParam.org.buf;
675
0
  const Pel* piCur      = rcDtParam.cur.buf;
676
0
  int  iRows            = rcDtParam.org.height;
677
0
  int  iCols            = rcDtParam.org.width;
678
0
  int  iStrideCur       = rcDtParam.cur.stride;
679
0
  int  iStrideOrg       = rcDtParam.org.stride;
680
681
0
  Distortion uiSum   = 0;
682
0
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
683
684
0
  Intermediate_Int iTemp;
685
686
0
  for( ; iRows != 0; iRows-- )
687
0
  {
688
0
    for (int n = 0; n < iCols; n++ )
689
0
    {
690
0
      iTemp = piOrg[n  ] - piCur[n  ];
691
0
      uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
692
0
    }
693
0
    piOrg += iStrideOrg;
694
0
    piCur += iStrideCur;
695
0
  }
696
697
0
  return ( uiSum );
698
0
}
699
700
Distortion RdCost::xGetSSE4( const DistParam &rcDtParam )
701
589k
{
702
589k
  if ( rcDtParam.applyWeight )
703
0
  {
704
0
    CHECK( rcDtParam.org.width != 4, "Invalid size" );
705
0
    THROW(" no support");
706
0
  }
707
708
589k
  const Pel* piOrg   = rcDtParam.org.buf;
709
589k
  const Pel* piCur   = rcDtParam.cur.buf;
710
589k
  int  iRows         = rcDtParam.org.height;
711
589k
  int  iStrideOrg    = rcDtParam.org.stride;
712
589k
  int  iStrideCur    = rcDtParam.cur.stride;
713
714
589k
  Distortion uiSum   = 0;
715
589k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
716
717
589k
  Intermediate_Int  iTemp;
718
719
8.39M
  for( ; iRows != 0; iRows-- )
720
7.80M
  {
721
722
7.80M
    iTemp = piOrg[0] - piCur[0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
723
7.80M
    iTemp = piOrg[1] - piCur[1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
724
7.80M
    iTemp = piOrg[2] - piCur[2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
725
7.80M
    iTemp = piOrg[3] - piCur[3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
726
727
7.80M
    piOrg += iStrideOrg;
728
7.80M
    piCur += iStrideCur;
729
7.80M
  }
730
731
589k
  return ( uiSum );
732
589k
}
733
734
Distortion RdCost::xGetSSE8( const DistParam &rcDtParam )
735
901k
{
736
901k
  if ( rcDtParam.applyWeight )
737
0
  {
738
0
    CHECK( rcDtParam.org.width != 8, "Invalid size" );
739
0
    THROW(" no support");
740
0
  }
741
742
901k
  const Pel* piOrg   = rcDtParam.org.buf;
743
901k
  const Pel* piCur   = rcDtParam.cur.buf;
744
901k
  int  iRows         = rcDtParam.org.height;
745
901k
  int  iStrideOrg    = rcDtParam.org.stride;
746
901k
  int  iStrideCur    = rcDtParam.cur.stride;
747
748
901k
  Distortion uiSum   = 0;
749
901k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
750
751
901k
  Intermediate_Int  iTemp;
752
753
14.1M
  for( ; iRows != 0; iRows-- )
754
13.2M
  {
755
13.2M
    iTemp = piOrg[0] - piCur[0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
756
13.2M
    iTemp = piOrg[1] - piCur[1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
757
13.2M
    iTemp = piOrg[2] - piCur[2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
758
13.2M
    iTemp = piOrg[3] - piCur[3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
759
13.2M
    iTemp = piOrg[4] - piCur[4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
760
13.2M
    iTemp = piOrg[5] - piCur[5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
761
13.2M
    iTemp = piOrg[6] - piCur[6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
762
13.2M
    iTemp = piOrg[7] - piCur[7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
763
764
13.2M
    piOrg += iStrideOrg;
765
13.2M
    piCur += iStrideCur;
766
13.2M
  }
767
768
901k
  return ( uiSum );
769
901k
}
770
771
Distortion RdCost::xGetSSE16( const DistParam &rcDtParam )
772
741k
{
773
741k
  if ( rcDtParam.applyWeight )
774
0
  {
775
0
    CHECK( rcDtParam.org.width != 16, "Invalid size" );
776
0
    THROW(" no support");
777
0
  }
778
779
741k
  const Pel* piOrg   = rcDtParam.org.buf;
780
741k
  const Pel* piCur   = rcDtParam.cur.buf;
781
741k
  int  iRows         = rcDtParam.org.height;
782
741k
  int  iStrideOrg    = rcDtParam.org.stride;
783
741k
  int  iStrideCur    = rcDtParam.cur.stride;
784
785
741k
  Distortion uiSum   = 0;
786
741k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
787
788
741k
  Intermediate_Int  iTemp;
789
790
11.6M
  for( ; iRows != 0; iRows-- )
791
10.9M
  {
792
793
10.9M
    iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
794
10.9M
    iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
795
10.9M
    iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
796
10.9M
    iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
797
10.9M
    iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
798
10.9M
    iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
799
10.9M
    iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
800
10.9M
    iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
801
10.9M
    iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
802
10.9M
    iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
803
10.9M
    iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
804
10.9M
    iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
805
10.9M
    iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
806
10.9M
    iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
807
10.9M
    iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
808
10.9M
    iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
809
810
10.9M
    piOrg += iStrideOrg;
811
10.9M
    piCur += iStrideCur;
812
10.9M
  }
813
814
741k
  return ( uiSum );
815
741k
}
816
817
Distortion RdCost::xGetSSE128( const DistParam &rcDtParam )
818
0
{
819
0
  if ( rcDtParam.applyWeight )
820
0
  {
821
0
    THROW(" no support");
822
0
  }
823
0
  const Pel* piOrg   = rcDtParam.org.buf;
824
0
  const Pel* piCur   = rcDtParam.cur.buf;
825
0
  int  iRows         = rcDtParam.org.height;
826
0
  int  iCols         = rcDtParam.org.width;
827
0
  int  iStrideOrg    = rcDtParam.org.stride;
828
0
  int  iStrideCur    = rcDtParam.cur.stride;
829
830
0
  Distortion uiSum   = 0;
831
0
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
832
833
0
  Intermediate_Int  iTemp;
834
835
0
  for( ; iRows != 0; iRows-- )
836
0
  {
837
0
    for (int n = 0; n < iCols; n+=16 )
838
0
    {
839
840
0
      iTemp = piOrg[n+ 0] - piCur[n+ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
841
0
      iTemp = piOrg[n+ 1] - piCur[n+ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
842
0
      iTemp = piOrg[n+ 2] - piCur[n+ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
843
0
      iTemp = piOrg[n+ 3] - piCur[n+ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
844
0
      iTemp = piOrg[n+ 4] - piCur[n+ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
845
0
      iTemp = piOrg[n+ 5] - piCur[n+ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
846
0
      iTemp = piOrg[n+ 6] - piCur[n+ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
847
0
      iTemp = piOrg[n+ 7] - piCur[n+ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
848
0
      iTemp = piOrg[n+ 8] - piCur[n+ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
849
0
      iTemp = piOrg[n+ 9] - piCur[n+ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
850
0
      iTemp = piOrg[n+10] - piCur[n+10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
851
0
      iTemp = piOrg[n+11] - piCur[n+11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
852
0
      iTemp = piOrg[n+12] - piCur[n+12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
853
0
      iTemp = piOrg[n+13] - piCur[n+13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
854
0
      iTemp = piOrg[n+14] - piCur[n+14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
855
0
      iTemp = piOrg[n+15] - piCur[n+15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
856
857
0
    }
858
0
    piOrg += iStrideOrg;
859
0
    piCur += iStrideCur;
860
0
  }
861
862
0
  return ( uiSum );
863
0
}
864
865
Distortion RdCost::xGetSSE32( const DistParam &rcDtParam )
866
614k
{
867
614k
  if ( rcDtParam.applyWeight )
868
0
  {
869
0
    THROW(" no support");
870
0
  }
871
872
614k
  const Pel* piOrg   = rcDtParam.org.buf;
873
614k
  const Pel* piCur   = rcDtParam.cur.buf;
874
614k
  int  iRows         = rcDtParam.org.height;
875
614k
  int  iStrideOrg    = rcDtParam.org.stride;
876
614k
  int  iStrideCur    = rcDtParam.cur.stride;
877
878
614k
  Distortion uiSum   = 0;
879
614k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
880
881
614k
  Intermediate_Int  iTemp;
882
883
10.8M
  for( ; iRows != 0; iRows-- )
884
10.2M
  {
885
886
10.2M
    iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
887
10.2M
    iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
888
10.2M
    iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
889
10.2M
    iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
890
10.2M
    iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
891
10.2M
    iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
892
10.2M
    iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
893
10.2M
    iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
894
10.2M
    iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
895
10.2M
    iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
896
10.2M
    iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
897
10.2M
    iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
898
10.2M
    iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
899
10.2M
    iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
900
10.2M
    iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
901
10.2M
    iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
902
10.2M
    iTemp = piOrg[16] - piCur[16]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
903
10.2M
    iTemp = piOrg[17] - piCur[17]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
904
10.2M
    iTemp = piOrg[18] - piCur[18]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
905
10.2M
    iTemp = piOrg[19] - piCur[19]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
906
10.2M
    iTemp = piOrg[20] - piCur[20]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
907
10.2M
    iTemp = piOrg[21] - piCur[21]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
908
10.2M
    iTemp = piOrg[22] - piCur[22]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
909
10.2M
    iTemp = piOrg[23] - piCur[23]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
910
10.2M
    iTemp = piOrg[24] - piCur[24]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
911
10.2M
    iTemp = piOrg[25] - piCur[25]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
912
10.2M
    iTemp = piOrg[26] - piCur[26]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
913
10.2M
    iTemp = piOrg[27] - piCur[27]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
914
10.2M
    iTemp = piOrg[28] - piCur[28]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
915
10.2M
    iTemp = piOrg[29] - piCur[29]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
916
10.2M
    iTemp = piOrg[30] - piCur[30]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
917
10.2M
    iTemp = piOrg[31] - piCur[31]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
918
919
10.2M
    piOrg += iStrideOrg;
920
10.2M
    piCur += iStrideCur;
921
10.2M
  }
922
923
614k
  return ( uiSum );
924
614k
}
925
926
Distortion RdCost::xGetSSE64( const DistParam &rcDtParam )
927
73.4k
{
928
73.4k
  if ( rcDtParam.applyWeight )
929
0
  {
930
0
    THROW(" no support");
931
0
  }
932
933
73.4k
  const Pel* piOrg   = rcDtParam.org.buf;
934
73.4k
  const Pel* piCur   = rcDtParam.cur.buf;
935
73.4k
  int  iRows         = rcDtParam.org.height;
936
73.4k
  int  iStrideOrg    = rcDtParam.org.stride;
937
73.4k
  int  iStrideCur    = rcDtParam.cur.stride;
938
939
73.4k
  Distortion uiSum   = 0;
940
73.4k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
941
942
73.4k
  Intermediate_Int  iTemp;
943
944
3.91M
  for( ; iRows != 0; iRows-- )
945
3.83M
  {
946
3.83M
    iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
947
3.83M
    iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
948
3.83M
    iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
949
3.83M
    iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
950
3.83M
    iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
951
3.83M
    iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
952
3.83M
    iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
953
3.83M
    iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
954
3.83M
    iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
955
3.83M
    iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
956
3.83M
    iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
957
3.83M
    iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
958
3.83M
    iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
959
3.83M
    iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
960
3.83M
    iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
961
3.83M
    iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
962
3.83M
    iTemp = piOrg[16] - piCur[16]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
963
3.83M
    iTemp = piOrg[17] - piCur[17]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
964
3.83M
    iTemp = piOrg[18] - piCur[18]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
965
3.83M
    iTemp = piOrg[19] - piCur[19]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
966
3.83M
    iTemp = piOrg[20] - piCur[20]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
967
3.83M
    iTemp = piOrg[21] - piCur[21]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
968
3.83M
    iTemp = piOrg[22] - piCur[22]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
969
3.83M
    iTemp = piOrg[23] - piCur[23]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
970
3.83M
    iTemp = piOrg[24] - piCur[24]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
971
3.83M
    iTemp = piOrg[25] - piCur[25]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
972
3.83M
    iTemp = piOrg[26] - piCur[26]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
973
3.83M
    iTemp = piOrg[27] - piCur[27]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
974
3.83M
    iTemp = piOrg[28] - piCur[28]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
975
3.83M
    iTemp = piOrg[29] - piCur[29]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
976
3.83M
    iTemp = piOrg[30] - piCur[30]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
977
3.83M
    iTemp = piOrg[31] - piCur[31]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
978
3.83M
    iTemp = piOrg[32] - piCur[32]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
979
3.83M
    iTemp = piOrg[33] - piCur[33]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
980
3.83M
    iTemp = piOrg[34] - piCur[34]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
981
3.83M
    iTemp = piOrg[35] - piCur[35]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
982
3.83M
    iTemp = piOrg[36] - piCur[36]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
983
3.83M
    iTemp = piOrg[37] - piCur[37]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
984
3.83M
    iTemp = piOrg[38] - piCur[38]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
985
3.83M
    iTemp = piOrg[39] - piCur[39]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
986
3.83M
    iTemp = piOrg[40] - piCur[40]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
987
3.83M
    iTemp = piOrg[41] - piCur[41]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
988
3.83M
    iTemp = piOrg[42] - piCur[42]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
989
3.83M
    iTemp = piOrg[43] - piCur[43]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
990
3.83M
    iTemp = piOrg[44] - piCur[44]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
991
3.83M
    iTemp = piOrg[45] - piCur[45]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
992
3.83M
    iTemp = piOrg[46] - piCur[46]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
993
3.83M
    iTemp = piOrg[47] - piCur[47]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
994
3.83M
    iTemp = piOrg[48] - piCur[48]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
995
3.83M
    iTemp = piOrg[49] - piCur[49]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
996
3.83M
    iTemp = piOrg[50] - piCur[50]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
997
3.83M
    iTemp = piOrg[51] - piCur[51]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
998
3.83M
    iTemp = piOrg[52] - piCur[52]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
999
3.83M
    iTemp = piOrg[53] - piCur[53]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1000
3.83M
    iTemp = piOrg[54] - piCur[54]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1001
3.83M
    iTemp = piOrg[55] - piCur[55]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1002
3.83M
    iTemp = piOrg[56] - piCur[56]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1003
3.83M
    iTemp = piOrg[57] - piCur[57]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1004
3.83M
    iTemp = piOrg[58] - piCur[58]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1005
3.83M
    iTemp = piOrg[59] - piCur[59]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1006
3.83M
    iTemp = piOrg[60] - piCur[60]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1007
3.83M
    iTemp = piOrg[61] - piCur[61]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1008
3.83M
    iTemp = piOrg[62] - piCur[62]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1009
3.83M
    iTemp = piOrg[63] - piCur[63]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1010
1011
3.83M
    piOrg += iStrideOrg;
1012
3.83M
    piCur += iStrideCur;
1013
3.83M
  }
1014
1015
73.4k
  return ( uiSum );
1016
73.4k
}
1017
1018
// --------------------------------------------------------------------------------------------------------------------
1019
// HADAMARD with step (used in fractional search)
1020
// --------------------------------------------------------------------------------------------------------------------
1021
1022
Distortion RdCost::xCalcHADs2x2( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1023
168k
{
1024
168k
  Distortion satd = 0;
1025
168k
  TCoeff diff[4], m[4];
1026
1027
168k
  diff[0] = piOrg[0             ] - piCur[0];
1028
168k
  diff[1] = piOrg[1             ] - piCur[1];
1029
168k
  diff[2] = piOrg[iStrideOrg    ] - piCur[0 + iStrideCur];
1030
168k
  diff[3] = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur];
1031
168k
  m[0] = diff[0] + diff[2];
1032
168k
  m[1] = diff[1] + diff[3];
1033
168k
  m[2] = diff[0] - diff[2];
1034
168k
  m[3] = diff[1] - diff[3];
1035
  
1036
168k
  satd += abs(m[0] + m[1]) >> 2;
1037
168k
  satd += abs(m[0] - m[1]);
1038
168k
  satd += abs(m[2] + m[3]);
1039
168k
  satd += abs(m[2] - m[3]);
1040
1041
168k
  return satd;
1042
168k
}
1043
1044
static Distortion xCalcHADs4x4( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1045
46.2k
{
1046
46.2k
  int k;
1047
46.2k
  Distortion satd = 0;
1048
46.2k
  TCoeff diff[16], m[16], d[16];
1049
1050
231k
  for( k = 0; k < 16; k+=4 )
1051
185k
  {
1052
185k
    diff[k+0] = piOrg[0] - piCur[0];
1053
185k
    diff[k+1] = piOrg[1] - piCur[1];
1054
185k
    diff[k+2] = piOrg[2] - piCur[2];
1055
185k
    diff[k+3] = piOrg[3] - piCur[3];
1056
1057
185k
    piCur += iStrideCur;
1058
185k
    piOrg += iStrideOrg;
1059
185k
  }
1060
1061
  /*===== hadamard transform =====*/
1062
46.2k
  m[ 0] = diff[ 0] + diff[12];
1063
46.2k
  m[ 1] = diff[ 1] + diff[13];
1064
46.2k
  m[ 2] = diff[ 2] + diff[14];
1065
46.2k
  m[ 3] = diff[ 3] + diff[15];
1066
46.2k
  m[ 4] = diff[ 4] + diff[ 8];
1067
46.2k
  m[ 5] = diff[ 5] + diff[ 9];
1068
46.2k
  m[ 6] = diff[ 6] + diff[10];
1069
46.2k
  m[ 7] = diff[ 7] + diff[11];
1070
46.2k
  m[ 8] = diff[ 4] - diff[ 8];
1071
46.2k
  m[ 9] = diff[ 5] - diff[ 9];
1072
46.2k
  m[10] = diff[ 6] - diff[10];
1073
46.2k
  m[11] = diff[ 7] - diff[11];
1074
46.2k
  m[12] = diff[ 0] - diff[12];
1075
46.2k
  m[13] = diff[ 1] - diff[13];
1076
46.2k
  m[14] = diff[ 2] - diff[14];
1077
46.2k
  m[15] = diff[ 3] - diff[15];
1078
1079
46.2k
  d[ 0] = m[ 0] + m[ 4];
1080
46.2k
  d[ 1] = m[ 1] + m[ 5];
1081
46.2k
  d[ 2] = m[ 2] + m[ 6];
1082
46.2k
  d[ 3] = m[ 3] + m[ 7];
1083
46.2k
  d[ 4] = m[ 8] + m[12];
1084
46.2k
  d[ 5] = m[ 9] + m[13];
1085
46.2k
  d[ 6] = m[10] + m[14];
1086
46.2k
  d[ 7] = m[11] + m[15];
1087
46.2k
  d[ 8] = m[ 0] - m[ 4];
1088
46.2k
  d[ 9] = m[ 1] - m[ 5];
1089
46.2k
  d[10] = m[ 2] - m[ 6];
1090
46.2k
  d[11] = m[ 3] - m[ 7];
1091
46.2k
  d[12] = m[12] - m[ 8];
1092
46.2k
  d[13] = m[13] - m[ 9];
1093
46.2k
  d[14] = m[14] - m[10];
1094
46.2k
  d[15] = m[15] - m[11];
1095
1096
46.2k
  m[ 0] = d[ 0] + d[ 3];
1097
46.2k
  m[ 1] = d[ 1] + d[ 2];
1098
46.2k
  m[ 2] = d[ 1] - d[ 2];
1099
46.2k
  m[ 3] = d[ 0] - d[ 3];
1100
46.2k
  m[ 4] = d[ 4] + d[ 7];
1101
46.2k
  m[ 5] = d[ 5] + d[ 6];
1102
46.2k
  m[ 6] = d[ 5] - d[ 6];
1103
46.2k
  m[ 7] = d[ 4] - d[ 7];
1104
46.2k
  m[ 8] = d[ 8] + d[11];
1105
46.2k
  m[ 9] = d[ 9] + d[10];
1106
46.2k
  m[10] = d[ 9] - d[10];
1107
46.2k
  m[11] = d[ 8] - d[11];
1108
46.2k
  m[12] = d[12] + d[15];
1109
46.2k
  m[13] = d[13] + d[14];
1110
46.2k
  m[14] = d[13] - d[14];
1111
46.2k
  m[15] = d[12] - d[15];
1112
1113
46.2k
  d[ 0] = m[ 0] + m[ 1];
1114
46.2k
  d[ 1] = m[ 0] - m[ 1];
1115
46.2k
  d[ 2] = m[ 2] + m[ 3];
1116
46.2k
  d[ 3] = m[ 3] - m[ 2];
1117
46.2k
  d[ 4] = m[ 4] + m[ 5];
1118
46.2k
  d[ 5] = m[ 4] - m[ 5];
1119
46.2k
  d[ 6] = m[ 6] + m[ 7];
1120
46.2k
  d[ 7] = m[ 7] - m[ 6];
1121
46.2k
  d[ 8] = m[ 8] + m[ 9];
1122
46.2k
  d[ 9] = m[ 8] - m[ 9];
1123
46.2k
  d[10] = m[10] + m[11];
1124
46.2k
  d[11] = m[11] - m[10];
1125
46.2k
  d[12] = m[12] + m[13];
1126
46.2k
  d[13] = m[12] - m[13];
1127
46.2k
  d[14] = m[14] + m[15];
1128
46.2k
  d[15] = m[15] - m[14];
1129
1130
786k
  for (k=0; k<16; ++k)
1131
740k
  {
1132
740k
    satd += abs(d[k]);
1133
740k
  }
1134
1135
46.2k
  satd -= abs( d[0] );
1136
46.2k
  satd += abs( d[0] ) >> 2;
1137
46.2k
  satd = ((satd+1)>>1);
1138
1139
46.2k
  return satd;
1140
46.2k
}
1141
1142
static Distortion xCalcHADs16x16_fast( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1143
0
{
1144
0
  int k, i, j, jj;
1145
0
  Distortion sad = 0;
1146
0
  TCoeff diff[64], m1[8][8], m2[8][8], m3[8][8];
1147
1148
0
  for( k = 0; k < 64; k += 8 )
1149
0
  {
1150
0
    diff[k+0] = ( ( piOrg[ 0] + piOrg[ 0+1] + piOrg[ 0+iStrideOrg] + piOrg[ 0+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 0] + piCur[ 0+1] + piCur[ 0+iStrideCur] + piCur[ 0+1+iStrideCur] + 2 ) >> 2 );
1151
0
    diff[k+1] = ( ( piOrg[ 2] + piOrg[ 2+1] + piOrg[ 2+iStrideOrg] + piOrg[ 2+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 2] + piCur[ 2+1] + piCur[ 2+iStrideCur] + piCur[ 2+1+iStrideCur] + 2 ) >> 2 );
1152
0
    diff[k+2] = ( ( piOrg[ 4] + piOrg[ 4+1] + piOrg[ 4+iStrideOrg] + piOrg[ 4+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 4] + piCur[ 4+1] + piCur[ 4+iStrideCur] + piCur[ 4+1+iStrideCur] + 2 ) >> 2 );
1153
0
    diff[k+3] = ( ( piOrg[ 6] + piOrg[ 6+1] + piOrg[ 6+iStrideOrg] + piOrg[ 6+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 6] + piCur[ 6+1] + piCur[ 6+iStrideCur] + piCur[ 6+1+iStrideCur] + 2 ) >> 2 );
1154
0
    diff[k+4] = ( ( piOrg[ 8] + piOrg[ 8+1] + piOrg[ 8+iStrideOrg] + piOrg[ 8+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 8] + piCur[ 8+1] + piCur[ 8+iStrideCur] + piCur[ 8+1+iStrideCur] + 2 ) >> 2 );
1155
0
    diff[k+5] = ( ( piOrg[10] + piOrg[10+1] + piOrg[10+iStrideOrg] + piOrg[10+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[10] + piCur[10+1] + piCur[10+iStrideCur] + piCur[10+1+iStrideCur] + 2 ) >> 2 );
1156
0
    diff[k+6] = ( ( piOrg[12] + piOrg[12+1] + piOrg[12+iStrideOrg] + piOrg[12+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[12] + piCur[12+1] + piCur[12+iStrideCur] + piCur[12+1+iStrideCur] + 2 ) >> 2 );
1157
0
    diff[k+7] = ( ( piOrg[14] + piOrg[14+1] + piOrg[14+iStrideOrg] + piOrg[14+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[14] + piCur[14+1] + piCur[14+iStrideCur] + piCur[14+1+iStrideCur] + 2 ) >> 2 );
1158
1159
0
    piCur += 2 * iStrideCur;
1160
0
    piOrg += 2 * iStrideOrg;
1161
0
  }
1162
1163
  //horizontal
1164
0
  for (j=0; j < 8; j++)
1165
0
  {
1166
0
    jj = j << 3;
1167
0
    m2[j][0] = diff[jj  ] + diff[jj+4];
1168
0
    m2[j][1] = diff[jj+1] + diff[jj+5];
1169
0
    m2[j][2] = diff[jj+2] + diff[jj+6];
1170
0
    m2[j][3] = diff[jj+3] + diff[jj+7];
1171
0
    m2[j][4] = diff[jj  ] - diff[jj+4];
1172
0
    m2[j][5] = diff[jj+1] - diff[jj+5];
1173
0
    m2[j][6] = diff[jj+2] - diff[jj+6];
1174
0
    m2[j][7] = diff[jj+3] - diff[jj+7];
1175
1176
0
    m1[j][0] = m2[j][0] + m2[j][2];
1177
0
    m1[j][1] = m2[j][1] + m2[j][3];
1178
0
    m1[j][2] = m2[j][0] - m2[j][2];
1179
0
    m1[j][3] = m2[j][1] - m2[j][3];
1180
0
    m1[j][4] = m2[j][4] + m2[j][6];
1181
0
    m1[j][5] = m2[j][5] + m2[j][7];
1182
0
    m1[j][6] = m2[j][4] - m2[j][6];
1183
0
    m1[j][7] = m2[j][5] - m2[j][7];
1184
1185
0
    m2[j][0] = m1[j][0] + m1[j][1];
1186
0
    m2[j][1] = m1[j][0] - m1[j][1];
1187
0
    m2[j][2] = m1[j][2] + m1[j][3];
1188
0
    m2[j][3] = m1[j][2] - m1[j][3];
1189
0
    m2[j][4] = m1[j][4] + m1[j][5];
1190
0
    m2[j][5] = m1[j][4] - m1[j][5];
1191
0
    m2[j][6] = m1[j][6] + m1[j][7];
1192
0
    m2[j][7] = m1[j][6] - m1[j][7];
1193
0
  }
1194
1195
  //vertical
1196
0
  for (i=0; i < 8; i++)
1197
0
  {
1198
0
    m3[0][i] = m2[0][i] + m2[4][i];
1199
0
    m3[1][i] = m2[1][i] + m2[5][i];
1200
0
    m3[2][i] = m2[2][i] + m2[6][i];
1201
0
    m3[3][i] = m2[3][i] + m2[7][i];
1202
0
    m3[4][i] = m2[0][i] - m2[4][i];
1203
0
    m3[5][i] = m2[1][i] - m2[5][i];
1204
0
    m3[6][i] = m2[2][i] - m2[6][i];
1205
0
    m3[7][i] = m2[3][i] - m2[7][i];
1206
1207
0
    m1[0][i] = m3[0][i] + m3[2][i];
1208
0
    m1[1][i] = m3[1][i] + m3[3][i];
1209
0
    m1[2][i] = m3[0][i] - m3[2][i];
1210
0
    m1[3][i] = m3[1][i] - m3[3][i];
1211
0
    m1[4][i] = m3[4][i] + m3[6][i];
1212
0
    m1[5][i] = m3[5][i] + m3[7][i];
1213
0
    m1[6][i] = m3[4][i] - m3[6][i];
1214
0
    m1[7][i] = m3[5][i] - m3[7][i];
1215
1216
0
    m2[0][i] = m1[0][i] + m1[1][i];
1217
0
    m2[1][i] = m1[0][i] - m1[1][i];
1218
0
    m2[2][i] = m1[2][i] + m1[3][i];
1219
0
    m2[3][i] = m1[2][i] - m1[3][i];
1220
0
    m2[4][i] = m1[4][i] + m1[5][i];
1221
0
    m2[5][i] = m1[4][i] - m1[5][i];
1222
0
    m2[6][i] = m1[6][i] + m1[7][i];
1223
0
    m2[7][i] = m1[6][i] - m1[7][i];
1224
0
  }
1225
1226
0
  for (i = 0; i < 8; i++)
1227
0
  {
1228
0
    for (j = 0; j < 8; j++)
1229
0
    {
1230
0
      sad += abs(m2[i][j]);
1231
0
    }
1232
0
  }
1233
  
1234
0
  sad -= abs( m2[0][0] );
1235
0
  sad += abs( m2[0][0] ) >> 2;
1236
0
  sad=((sad+2)>>2);
1237
1238
0
  return (sad << 2);
1239
0
}
1240
1241
static Distortion xCalcHADs8x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1242
17.4M
{
1243
17.4M
  int k, i, j, jj;
1244
17.4M
  Distortion sad = 0;
1245
17.4M
  TCoeff diff[64], m1[8][8], m2[8][8], m3[8][8];
1246
1247
157M
  for( k = 0; k < 64; k += 8 )
1248
139M
  {
1249
139M
    diff[k+0] = piOrg[0] - piCur[0];
1250
139M
    diff[k+1] = piOrg[1] - piCur[1];
1251
139M
    diff[k+2] = piOrg[2] - piCur[2];
1252
139M
    diff[k+3] = piOrg[3] - piCur[3];
1253
139M
    diff[k+4] = piOrg[4] - piCur[4];
1254
139M
    diff[k+5] = piOrg[5] - piCur[5];
1255
139M
    diff[k+6] = piOrg[6] - piCur[6];
1256
139M
    diff[k+7] = piOrg[7] - piCur[7];
1257
1258
139M
    piCur += iStrideCur;
1259
139M
    piOrg += iStrideOrg;
1260
139M
  }
1261
1262
  //horizontal
1263
157M
  for (j=0; j < 8; j++)
1264
139M
  {
1265
139M
    jj = j << 3;
1266
139M
    m2[j][0] = diff[jj  ] + diff[jj+4];
1267
139M
    m2[j][1] = diff[jj+1] + diff[jj+5];
1268
139M
    m2[j][2] = diff[jj+2] + diff[jj+6];
1269
139M
    m2[j][3] = diff[jj+3] + diff[jj+7];
1270
139M
    m2[j][4] = diff[jj  ] - diff[jj+4];
1271
139M
    m2[j][5] = diff[jj+1] - diff[jj+5];
1272
139M
    m2[j][6] = diff[jj+2] - diff[jj+6];
1273
139M
    m2[j][7] = diff[jj+3] - diff[jj+7];
1274
1275
139M
    m1[j][0] = m2[j][0] + m2[j][2];
1276
139M
    m1[j][1] = m2[j][1] + m2[j][3];
1277
139M
    m1[j][2] = m2[j][0] - m2[j][2];
1278
139M
    m1[j][3] = m2[j][1] - m2[j][3];
1279
139M
    m1[j][4] = m2[j][4] + m2[j][6];
1280
139M
    m1[j][5] = m2[j][5] + m2[j][7];
1281
139M
    m1[j][6] = m2[j][4] - m2[j][6];
1282
139M
    m1[j][7] = m2[j][5] - m2[j][7];
1283
1284
139M
    m2[j][0] = m1[j][0] + m1[j][1];
1285
139M
    m2[j][1] = m1[j][0] - m1[j][1];
1286
139M
    m2[j][2] = m1[j][2] + m1[j][3];
1287
139M
    m2[j][3] = m1[j][2] - m1[j][3];
1288
139M
    m2[j][4] = m1[j][4] + m1[j][5];
1289
139M
    m2[j][5] = m1[j][4] - m1[j][5];
1290
139M
    m2[j][6] = m1[j][6] + m1[j][7];
1291
139M
    m2[j][7] = m1[j][6] - m1[j][7];
1292
139M
  }
1293
1294
  //vertical
1295
157M
  for (i=0; i < 8; i++)
1296
139M
  {
1297
139M
    m3[0][i] = m2[0][i] + m2[4][i];
1298
139M
    m3[1][i] = m2[1][i] + m2[5][i];
1299
139M
    m3[2][i] = m2[2][i] + m2[6][i];
1300
139M
    m3[3][i] = m2[3][i] + m2[7][i];
1301
139M
    m3[4][i] = m2[0][i] - m2[4][i];
1302
139M
    m3[5][i] = m2[1][i] - m2[5][i];
1303
139M
    m3[6][i] = m2[2][i] - m2[6][i];
1304
139M
    m3[7][i] = m2[3][i] - m2[7][i];
1305
1306
139M
    m1[0][i] = m3[0][i] + m3[2][i];
1307
139M
    m1[1][i] = m3[1][i] + m3[3][i];
1308
139M
    m1[2][i] = m3[0][i] - m3[2][i];
1309
139M
    m1[3][i] = m3[1][i] - m3[3][i];
1310
139M
    m1[4][i] = m3[4][i] + m3[6][i];
1311
139M
    m1[5][i] = m3[5][i] + m3[7][i];
1312
139M
    m1[6][i] = m3[4][i] - m3[6][i];
1313
139M
    m1[7][i] = m3[5][i] - m3[7][i];
1314
1315
139M
    m2[0][i] = m1[0][i] + m1[1][i];
1316
139M
    m2[1][i] = m1[0][i] - m1[1][i];
1317
139M
    m2[2][i] = m1[2][i] + m1[3][i];
1318
139M
    m2[3][i] = m1[2][i] - m1[3][i];
1319
139M
    m2[4][i] = m1[4][i] + m1[5][i];
1320
139M
    m2[5][i] = m1[4][i] - m1[5][i];
1321
139M
    m2[6][i] = m1[6][i] + m1[7][i];
1322
139M
    m2[7][i] = m1[6][i] - m1[7][i];
1323
139M
  }
1324
1325
157M
  for (i = 0; i < 8; i++)
1326
139M
  {
1327
1.25G
    for (j = 0; j < 8; j++)
1328
1.11G
    {
1329
1.11G
      sad += abs(m2[i][j]);
1330
1.11G
    }
1331
139M
  }
1332
  
1333
17.4M
  sad -= abs( m2[0][0] );
1334
17.4M
  sad += abs( m2[0][0] ) >> 2;
1335
17.4M
  sad=((sad+2)>>2);
1336
1337
17.4M
  return sad;
1338
17.4M
}
1339
1340
static Distortion xCalcHADs16x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1341
987k
{   //need to add SIMD implementation ,JCA
1342
987k
  int k, i, j, jj, sad = 0;
1343
987k
  int diff[128], m1[8][16], m2[8][16];
1344
8.88M
  for( k = 0; k < 128; k += 16 )
1345
7.89M
  {
1346
7.89M
    diff[k + 0] = piOrg[0] - piCur[0];
1347
7.89M
    diff[k + 1] = piOrg[1] - piCur[1];
1348
7.89M
    diff[k + 2] = piOrg[2] - piCur[2];
1349
7.89M
    diff[k + 3] = piOrg[3] - piCur[3];
1350
7.89M
    diff[k + 4] = piOrg[4] - piCur[4];
1351
7.89M
    diff[k + 5] = piOrg[5] - piCur[5];
1352
7.89M
    diff[k + 6] = piOrg[6] - piCur[6];
1353
7.89M
    diff[k + 7] = piOrg[7] - piCur[7];
1354
1355
7.89M
    diff[k + 8] = piOrg[8] - piCur[8];
1356
7.89M
    diff[k + 9] = piOrg[9] - piCur[9];
1357
7.89M
    diff[k + 10] = piOrg[10] - piCur[10];
1358
7.89M
    diff[k + 11] = piOrg[11] - piCur[11];
1359
7.89M
    diff[k + 12] = piOrg[12] - piCur[12];
1360
7.89M
    diff[k + 13] = piOrg[13] - piCur[13];
1361
7.89M
    diff[k + 14] = piOrg[14] - piCur[14];
1362
7.89M
    diff[k + 15] = piOrg[15] - piCur[15];
1363
1364
7.89M
    piCur += iStrideCur;
1365
7.89M
    piOrg += iStrideOrg;
1366
7.89M
  }
1367
1368
  //horizontal
1369
8.88M
  for( j = 0; j < 8; j++ )
1370
7.89M
  {
1371
7.89M
    jj = j << 4;
1372
1373
7.89M
    m2[j][0] = diff[jj    ] + diff[jj + 8];
1374
7.89M
    m2[j][1] = diff[jj + 1] + diff[jj + 9];
1375
7.89M
    m2[j][2] = diff[jj + 2] + diff[jj + 10];
1376
7.89M
    m2[j][3] = diff[jj + 3] + diff[jj + 11];
1377
7.89M
    m2[j][4] = diff[jj + 4] + diff[jj + 12];
1378
7.89M
    m2[j][5] = diff[jj + 5] + diff[jj + 13];
1379
7.89M
    m2[j][6] = diff[jj + 6] + diff[jj + 14];
1380
7.89M
    m2[j][7] = diff[jj + 7] + diff[jj + 15];
1381
7.89M
    m2[j][8] = diff[jj    ] - diff[jj + 8];
1382
7.89M
    m2[j][9] = diff[jj + 1] - diff[jj + 9];
1383
7.89M
    m2[j][10] = diff[jj + 2] - diff[jj + 10];
1384
7.89M
    m2[j][11] = diff[jj + 3] - diff[jj + 11];
1385
7.89M
    m2[j][12] = diff[jj + 4] - diff[jj + 12];
1386
7.89M
    m2[j][13] = diff[jj + 5] - diff[jj + 13];
1387
7.89M
    m2[j][14] = diff[jj + 6] - diff[jj + 14];
1388
7.89M
    m2[j][15] = diff[jj + 7] - diff[jj + 15];
1389
1390
7.89M
    m1[j][0] = m2[j][0] + m2[j][4];
1391
7.89M
    m1[j][1] = m2[j][1] + m2[j][5];
1392
7.89M
    m1[j][2] = m2[j][2] + m2[j][6];
1393
7.89M
    m1[j][3] = m2[j][3] + m2[j][7];
1394
7.89M
    m1[j][4] = m2[j][0] - m2[j][4];
1395
7.89M
    m1[j][5] = m2[j][1] - m2[j][5];
1396
7.89M
    m1[j][6] = m2[j][2] - m2[j][6];
1397
7.89M
    m1[j][7] = m2[j][3] - m2[j][7];
1398
7.89M
    m1[j][8] = m2[j][8] + m2[j][12];
1399
7.89M
    m1[j][9] = m2[j][9] + m2[j][13];
1400
7.89M
    m1[j][10] = m2[j][10] + m2[j][14];
1401
7.89M
    m1[j][11] = m2[j][11] + m2[j][15];
1402
7.89M
    m1[j][12] = m2[j][8] - m2[j][12];
1403
7.89M
    m1[j][13] = m2[j][9] - m2[j][13];
1404
7.89M
    m1[j][14] = m2[j][10] - m2[j][14];
1405
7.89M
    m1[j][15] = m2[j][11] - m2[j][15];
1406
1407
7.89M
    m2[j][0] = m1[j][0] + m1[j][2];
1408
7.89M
    m2[j][1] = m1[j][1] + m1[j][3];
1409
7.89M
    m2[j][2] = m1[j][0] - m1[j][2];
1410
7.89M
    m2[j][3] = m1[j][1] - m1[j][3];
1411
7.89M
    m2[j][4] = m1[j][4] + m1[j][6];
1412
7.89M
    m2[j][5] = m1[j][5] + m1[j][7];
1413
7.89M
    m2[j][6] = m1[j][4] - m1[j][6];
1414
7.89M
    m2[j][7] = m1[j][5] - m1[j][7];
1415
7.89M
    m2[j][8] = m1[j][8] + m1[j][10];
1416
7.89M
    m2[j][9] = m1[j][9] + m1[j][11];
1417
7.89M
    m2[j][10] = m1[j][8] - m1[j][10];
1418
7.89M
    m2[j][11] = m1[j][9] - m1[j][11];
1419
7.89M
    m2[j][12] = m1[j][12] + m1[j][14];
1420
7.89M
    m2[j][13] = m1[j][13] + m1[j][15];
1421
7.89M
    m2[j][14] = m1[j][12] - m1[j][14];
1422
7.89M
    m2[j][15] = m1[j][13] - m1[j][15];
1423
1424
7.89M
    m1[j][0] = m2[j][0] + m2[j][1];
1425
7.89M
    m1[j][1] = m2[j][0] - m2[j][1];
1426
7.89M
    m1[j][2] = m2[j][2] + m2[j][3];
1427
7.89M
    m1[j][3] = m2[j][2] - m2[j][3];
1428
7.89M
    m1[j][4] = m2[j][4] + m2[j][5];
1429
7.89M
    m1[j][5] = m2[j][4] - m2[j][5];
1430
7.89M
    m1[j][6] = m2[j][6] + m2[j][7];
1431
7.89M
    m1[j][7] = m2[j][6] - m2[j][7];
1432
7.89M
    m1[j][8] = m2[j][8] + m2[j][9];
1433
7.89M
    m1[j][9] = m2[j][8] - m2[j][9];
1434
7.89M
    m1[j][10] = m2[j][10] + m2[j][11];
1435
7.89M
    m1[j][11] = m2[j][10] - m2[j][11];
1436
7.89M
    m1[j][12] = m2[j][12] + m2[j][13];
1437
7.89M
    m1[j][13] = m2[j][12] - m2[j][13];
1438
7.89M
    m1[j][14] = m2[j][14] + m2[j][15];
1439
7.89M
    m1[j][15] = m2[j][14] - m2[j][15];
1440
7.89M
  }
1441
1442
  //vertical
1443
16.7M
  for( i = 0; i < 16; i++ )
1444
15.7M
  {
1445
15.7M
    m2[0][i] = m1[0][i] + m1[4][i];
1446
15.7M
    m2[1][i] = m1[1][i] + m1[5][i];
1447
15.7M
    m2[2][i] = m1[2][i] + m1[6][i];
1448
15.7M
    m2[3][i] = m1[3][i] + m1[7][i];
1449
15.7M
    m2[4][i] = m1[0][i] - m1[4][i];
1450
15.7M
    m2[5][i] = m1[1][i] - m1[5][i];
1451
15.7M
    m2[6][i] = m1[2][i] - m1[6][i];
1452
15.7M
    m2[7][i] = m1[3][i] - m1[7][i];
1453
1454
15.7M
    m1[0][i] = m2[0][i] + m2[2][i];
1455
15.7M
    m1[1][i] = m2[1][i] + m2[3][i];
1456
15.7M
    m1[2][i] = m2[0][i] - m2[2][i];
1457
15.7M
    m1[3][i] = m2[1][i] - m2[3][i];
1458
15.7M
    m1[4][i] = m2[4][i] + m2[6][i];
1459
15.7M
    m1[5][i] = m2[5][i] + m2[7][i];
1460
15.7M
    m1[6][i] = m2[4][i] - m2[6][i];
1461
15.7M
    m1[7][i] = m2[5][i] - m2[7][i];
1462
1463
15.7M
    m2[0][i] = m1[0][i] + m1[1][i];
1464
15.7M
    m2[1][i] = m1[0][i] - m1[1][i];
1465
15.7M
    m2[2][i] = m1[2][i] + m1[3][i];
1466
15.7M
    m2[3][i] = m1[2][i] - m1[3][i];
1467
15.7M
    m2[4][i] = m1[4][i] + m1[5][i];
1468
15.7M
    m2[5][i] = m1[4][i] - m1[5][i];
1469
15.7M
    m2[6][i] = m1[6][i] + m1[7][i];
1470
15.7M
    m2[7][i] = m1[6][i] - m1[7][i];
1471
15.7M
  }
1472
1473
8.88M
  for( i = 0; i < 8; i++ )
1474
7.89M
  {
1475
134M
    for( j = 0; j < 16; j++ )
1476
126M
    {
1477
126M
      sad += abs( m2[i][j] );
1478
126M
    }
1479
7.89M
  }
1480
  
1481
987k
  sad -= abs( m2[0][0] );
1482
987k
  sad += abs( m2[0][0] ) >> 2;
1483
987k
  sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );
1484
1485
987k
  return sad;
1486
987k
}
1487
1488
static Distortion xCalcHADs8x16( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1489
989k
{
1490
989k
  int k, i, j, jj, sad = 0;
1491
989k
  int diff[128], m1[16][8], m2[16][8];
1492
16.8M
  for( k = 0; k < 128; k += 8 )
1493
15.8M
  {
1494
15.8M
    diff[k + 0] = piOrg[0] - piCur[0];
1495
15.8M
    diff[k + 1] = piOrg[1] - piCur[1];
1496
15.8M
    diff[k + 2] = piOrg[2] - piCur[2];
1497
15.8M
    diff[k + 3] = piOrg[3] - piCur[3];
1498
15.8M
    diff[k + 4] = piOrg[4] - piCur[4];
1499
15.8M
    diff[k + 5] = piOrg[5] - piCur[5];
1500
15.8M
    diff[k + 6] = piOrg[6] - piCur[6];
1501
15.8M
    diff[k + 7] = piOrg[7] - piCur[7];
1502
1503
15.8M
    piCur += iStrideCur;
1504
15.8M
    piOrg += iStrideOrg;
1505
15.8M
  }
1506
1507
  //horizontal
1508
16.8M
  for( j = 0; j < 16; j++ )
1509
15.8M
  {
1510
15.8M
    jj = j << 3;
1511
1512
15.8M
    m2[j][0] = diff[jj] + diff[jj + 4];
1513
15.8M
    m2[j][1] = diff[jj + 1] + diff[jj + 5];
1514
15.8M
    m2[j][2] = diff[jj + 2] + diff[jj + 6];
1515
15.8M
    m2[j][3] = diff[jj + 3] + diff[jj + 7];
1516
15.8M
    m2[j][4] = diff[jj] - diff[jj + 4];
1517
15.8M
    m2[j][5] = diff[jj + 1] - diff[jj + 5];
1518
15.8M
    m2[j][6] = diff[jj + 2] - diff[jj + 6];
1519
15.8M
    m2[j][7] = diff[jj + 3] - diff[jj + 7];
1520
1521
15.8M
    m1[j][0] = m2[j][0] + m2[j][2];
1522
15.8M
    m1[j][1] = m2[j][1] + m2[j][3];
1523
15.8M
    m1[j][2] = m2[j][0] - m2[j][2];
1524
15.8M
    m1[j][3] = m2[j][1] - m2[j][3];
1525
15.8M
    m1[j][4] = m2[j][4] + m2[j][6];
1526
15.8M
    m1[j][5] = m2[j][5] + m2[j][7];
1527
15.8M
    m1[j][6] = m2[j][4] - m2[j][6];
1528
15.8M
    m1[j][7] = m2[j][5] - m2[j][7];
1529
1530
15.8M
    m2[j][0] = m1[j][0] + m1[j][1];
1531
15.8M
    m2[j][1] = m1[j][0] - m1[j][1];
1532
15.8M
    m2[j][2] = m1[j][2] + m1[j][3];
1533
15.8M
    m2[j][3] = m1[j][2] - m1[j][3];
1534
15.8M
    m2[j][4] = m1[j][4] + m1[j][5];
1535
15.8M
    m2[j][5] = m1[j][4] - m1[j][5];
1536
15.8M
    m2[j][6] = m1[j][6] + m1[j][7];
1537
15.8M
    m2[j][7] = m1[j][6] - m1[j][7];
1538
15.8M
  }
1539
1540
  //vertical
1541
8.90M
  for( i = 0; i < 8; i++ )
1542
7.91M
  {
1543
7.91M
    m1[0][i] = m2[0][i] + m2[8][i];
1544
7.91M
    m1[1][i] = m2[1][i] + m2[9][i];
1545
7.91M
    m1[2][i] = m2[2][i] + m2[10][i];
1546
7.91M
    m1[3][i] = m2[3][i] + m2[11][i];
1547
7.91M
    m1[4][i] = m2[4][i] + m2[12][i];
1548
7.91M
    m1[5][i] = m2[5][i] + m2[13][i];
1549
7.91M
    m1[6][i] = m2[6][i] + m2[14][i];
1550
7.91M
    m1[7][i] = m2[7][i] + m2[15][i];
1551
7.91M
    m1[8][i] = m2[0][i] - m2[8][i];
1552
7.91M
    m1[9][i] = m2[1][i] - m2[9][i];
1553
7.91M
    m1[10][i] = m2[2][i] - m2[10][i];
1554
7.91M
    m1[11][i] = m2[3][i] - m2[11][i];
1555
7.91M
    m1[12][i] = m2[4][i] - m2[12][i];
1556
7.91M
    m1[13][i] = m2[5][i] - m2[13][i];
1557
7.91M
    m1[14][i] = m2[6][i] - m2[14][i];
1558
7.91M
    m1[15][i] = m2[7][i] - m2[15][i];
1559
1560
7.91M
    m2[0][i] = m1[0][i] + m1[4][i];
1561
7.91M
    m2[1][i] = m1[1][i] + m1[5][i];
1562
7.91M
    m2[2][i] = m1[2][i] + m1[6][i];
1563
7.91M
    m2[3][i] = m1[3][i] + m1[7][i];
1564
7.91M
    m2[4][i] = m1[0][i] - m1[4][i];
1565
7.91M
    m2[5][i] = m1[1][i] - m1[5][i];
1566
7.91M
    m2[6][i] = m1[2][i] - m1[6][i];
1567
7.91M
    m2[7][i] = m1[3][i] - m1[7][i];
1568
7.91M
    m2[8][i] = m1[8][i] + m1[12][i];
1569
7.91M
    m2[9][i] = m1[9][i] + m1[13][i];
1570
7.91M
    m2[10][i] = m1[10][i] + m1[14][i];
1571
7.91M
    m2[11][i] = m1[11][i] + m1[15][i];
1572
7.91M
    m2[12][i] = m1[8][i] - m1[12][i];
1573
7.91M
    m2[13][i] = m1[9][i] - m1[13][i];
1574
7.91M
    m2[14][i] = m1[10][i] - m1[14][i];
1575
7.91M
    m2[15][i] = m1[11][i] - m1[15][i];
1576
1577
7.91M
    m1[0][i] = m2[0][i] + m2[2][i];
1578
7.91M
    m1[1][i] = m2[1][i] + m2[3][i];
1579
7.91M
    m1[2][i] = m2[0][i] - m2[2][i];
1580
7.91M
    m1[3][i] = m2[1][i] - m2[3][i];
1581
7.91M
    m1[4][i] = m2[4][i] + m2[6][i];
1582
7.91M
    m1[5][i] = m2[5][i] + m2[7][i];
1583
7.91M
    m1[6][i] = m2[4][i] - m2[6][i];
1584
7.91M
    m1[7][i] = m2[5][i] - m2[7][i];
1585
7.91M
    m1[8][i] = m2[8][i] + m2[10][i];
1586
7.91M
    m1[9][i] = m2[9][i] + m2[11][i];
1587
7.91M
    m1[10][i] = m2[8][i] - m2[10][i];
1588
7.91M
    m1[11][i] = m2[9][i] - m2[11][i];
1589
7.91M
    m1[12][i] = m2[12][i] + m2[14][i];
1590
7.91M
    m1[13][i] = m2[13][i] + m2[15][i];
1591
7.91M
    m1[14][i] = m2[12][i] - m2[14][i];
1592
7.91M
    m1[15][i] = m2[13][i] - m2[15][i];
1593
1594
7.91M
    m2[0][i] = m1[0][i] + m1[1][i];
1595
7.91M
    m2[1][i] = m1[0][i] - m1[1][i];
1596
7.91M
    m2[2][i] = m1[2][i] + m1[3][i];
1597
7.91M
    m2[3][i] = m1[2][i] - m1[3][i];
1598
7.91M
    m2[4][i] = m1[4][i] + m1[5][i];
1599
7.91M
    m2[5][i] = m1[4][i] - m1[5][i];
1600
7.91M
    m2[6][i] = m1[6][i] + m1[7][i];
1601
7.91M
    m2[7][i] = m1[6][i] - m1[7][i];
1602
7.91M
    m2[8][i] = m1[8][i] + m1[9][i];
1603
7.91M
    m2[9][i] = m1[8][i] - m1[9][i];
1604
7.91M
    m2[10][i] = m1[10][i] + m1[11][i];
1605
7.91M
    m2[11][i] = m1[10][i] - m1[11][i];
1606
7.91M
    m2[12][i] = m1[12][i] + m1[13][i];
1607
7.91M
    m2[13][i] = m1[12][i] - m1[13][i];
1608
7.91M
    m2[14][i] = m1[14][i] + m1[15][i];
1609
7.91M
    m2[15][i] = m1[14][i] - m1[15][i];
1610
7.91M
  }
1611
1612
16.8M
  for( i = 0; i < 16; i++ )
1613
15.8M
  {
1614
142M
    for( j = 0; j < 8; j++ )
1615
126M
    {
1616
126M
      sad += abs( m2[i][j] );
1617
126M
    }
1618
15.8M
  }
1619
  
1620
989k
  sad -= abs( m2[0][0] );
1621
989k
  sad += abs( m2[0][0] ) >> 2;
1622
989k
  sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );
1623
1624
989k
  return sad;
1625
989k
}
1626
1627
static Distortion xCalcHADs4x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1628
175k
{
1629
175k
  int k, i, j, jj, sad = 0;
1630
175k
  int diff[32], m1[8][4], m2[8][4];
1631
1.57M
  for( k = 0; k < 32; k += 4 )
1632
1.40M
  {
1633
1.40M
    diff[k + 0] = piOrg[0] - piCur[0];
1634
1.40M
    diff[k + 1] = piOrg[1] - piCur[1];
1635
1.40M
    diff[k + 2] = piOrg[2] - piCur[2];
1636
1.40M
    diff[k + 3] = piOrg[3] - piCur[3];
1637
1638
1.40M
    piCur += iStrideCur;
1639
1.40M
    piOrg += iStrideOrg;
1640
1.40M
  }
1641
1642
  //horizontal
1643
1.57M
  for( j = 0; j < 8; j++ )
1644
1.40M
  {
1645
1.40M
    jj = j << 2;
1646
1.40M
    m2[j][0] = diff[jj] + diff[jj + 2];
1647
1.40M
    m2[j][1] = diff[jj + 1] + diff[jj + 3];
1648
1.40M
    m2[j][2] = diff[jj] - diff[jj + 2];
1649
1.40M
    m2[j][3] = diff[jj + 1] - diff[jj + 3];
1650
1651
1.40M
    m1[j][0] = m2[j][0] + m2[j][1];
1652
1.40M
    m1[j][1] = m2[j][0] - m2[j][1];
1653
1.40M
    m1[j][2] = m2[j][2] + m2[j][3];
1654
1.40M
    m1[j][3] = m2[j][2] - m2[j][3];
1655
1.40M
  }
1656
1657
  //vertical
1658
877k
  for( i = 0; i < 4; i++ )
1659
701k
  {
1660
701k
    m2[0][i] = m1[0][i] + m1[4][i];
1661
701k
    m2[1][i] = m1[1][i] + m1[5][i];
1662
701k
    m2[2][i] = m1[2][i] + m1[6][i];
1663
701k
    m2[3][i] = m1[3][i] + m1[7][i];
1664
701k
    m2[4][i] = m1[0][i] - m1[4][i];
1665
701k
    m2[5][i] = m1[1][i] - m1[5][i];
1666
701k
    m2[6][i] = m1[2][i] - m1[6][i];
1667
701k
    m2[7][i] = m1[3][i] - m1[7][i];
1668
1669
701k
    m1[0][i] = m2[0][i] + m2[2][i];
1670
701k
    m1[1][i] = m2[1][i] + m2[3][i];
1671
701k
    m1[2][i] = m2[0][i] - m2[2][i];
1672
701k
    m1[3][i] = m2[1][i] - m2[3][i];
1673
701k
    m1[4][i] = m2[4][i] + m2[6][i];
1674
701k
    m1[5][i] = m2[5][i] + m2[7][i];
1675
701k
    m1[6][i] = m2[4][i] - m2[6][i];
1676
701k
    m1[7][i] = m2[5][i] - m2[7][i];
1677
1678
701k
    m2[0][i] = m1[0][i] + m1[1][i];
1679
701k
    m2[1][i] = m1[0][i] - m1[1][i];
1680
701k
    m2[2][i] = m1[2][i] + m1[3][i];
1681
701k
    m2[3][i] = m1[2][i] - m1[3][i];
1682
701k
    m2[4][i] = m1[4][i] + m1[5][i];
1683
701k
    m2[5][i] = m1[4][i] - m1[5][i];
1684
701k
    m2[6][i] = m1[6][i] + m1[7][i];
1685
701k
    m2[7][i] = m1[6][i] - m1[7][i];
1686
701k
  }
1687
1688
1.57M
  for( i = 0; i < 8; i++ )
1689
1.40M
  {
1690
7.01M
    for( j = 0; j < 4; j++ )
1691
5.61M
    {
1692
5.61M
      sad += abs( m2[i][j] );
1693
5.61M
    }
1694
1.40M
  }
1695
  
1696
175k
  sad -= abs( m2[0][0] );
1697
175k
  sad += abs( m2[0][0] ) >> 2;
1698
175k
  sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );
1699
1700
175k
  return sad;
1701
175k
}
1702
1703
static Distortion xCalcHADs8x4( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1704
181k
{
1705
181k
  int k, i, j, jj, sad = 0;
1706
181k
  int diff[32], m1[4][8], m2[4][8];
1707
908k
  for( k = 0; k < 32; k += 8 )
1708
727k
  {
1709
727k
    diff[k + 0] = piOrg[0] - piCur[0];
1710
727k
    diff[k + 1] = piOrg[1] - piCur[1];
1711
727k
    diff[k + 2] = piOrg[2] - piCur[2];
1712
727k
    diff[k + 3] = piOrg[3] - piCur[3];
1713
727k
    diff[k + 4] = piOrg[4] - piCur[4];
1714
727k
    diff[k + 5] = piOrg[5] - piCur[5];
1715
727k
    diff[k + 6] = piOrg[6] - piCur[6];
1716
727k
    diff[k + 7] = piOrg[7] - piCur[7];
1717
1718
727k
    piCur += iStrideCur;
1719
727k
    piOrg += iStrideOrg;
1720
727k
  }
1721
1722
  //horizontal
1723
908k
  for( j = 0; j < 4; j++ )
1724
727k
  {
1725
727k
    jj = j << 3;
1726
1727
727k
    m2[j][0] = diff[jj] + diff[jj + 4];
1728
727k
    m2[j][1] = diff[jj + 1] + diff[jj + 5];
1729
727k
    m2[j][2] = diff[jj + 2] + diff[jj + 6];
1730
727k
    m2[j][3] = diff[jj + 3] + diff[jj + 7];
1731
727k
    m2[j][4] = diff[jj] - diff[jj + 4];
1732
727k
    m2[j][5] = diff[jj + 1] - diff[jj + 5];
1733
727k
    m2[j][6] = diff[jj + 2] - diff[jj + 6];
1734
727k
    m2[j][7] = diff[jj + 3] - diff[jj + 7];
1735
1736
727k
    m1[j][0] = m2[j][0] + m2[j][2];
1737
727k
    m1[j][1] = m2[j][1] + m2[j][3];
1738
727k
    m1[j][2] = m2[j][0] - m2[j][2];
1739
727k
    m1[j][3] = m2[j][1] - m2[j][3];
1740
727k
    m1[j][4] = m2[j][4] + m2[j][6];
1741
727k
    m1[j][5] = m2[j][5] + m2[j][7];
1742
727k
    m1[j][6] = m2[j][4] - m2[j][6];
1743
727k
    m1[j][7] = m2[j][5] - m2[j][7];
1744
1745
727k
    m2[j][0] = m1[j][0] + m1[j][1];
1746
727k
    m2[j][1] = m1[j][0] - m1[j][1];
1747
727k
    m2[j][2] = m1[j][2] + m1[j][3];
1748
727k
    m2[j][3] = m1[j][2] - m1[j][3];
1749
727k
    m2[j][4] = m1[j][4] + m1[j][5];
1750
727k
    m2[j][5] = m1[j][4] - m1[j][5];
1751
727k
    m2[j][6] = m1[j][6] + m1[j][7];
1752
727k
    m2[j][7] = m1[j][6] - m1[j][7];
1753
727k
  }
1754
1755
  //vertical
1756
1.63M
  for( i = 0; i < 8; i++ )
1757
1.45M
  {
1758
1.45M
    m1[0][i] = m2[0][i] + m2[2][i];
1759
1.45M
    m1[1][i] = m2[1][i] + m2[3][i];
1760
1.45M
    m1[2][i] = m2[0][i] - m2[2][i];
1761
1.45M
    m1[3][i] = m2[1][i] - m2[3][i];
1762
1763
1.45M
    m2[0][i] = m1[0][i] + m1[1][i];
1764
1.45M
    m2[1][i] = m1[0][i] - m1[1][i];
1765
1.45M
    m2[2][i] = m1[2][i] + m1[3][i];
1766
1.45M
    m2[3][i] = m1[2][i] - m1[3][i];
1767
1.45M
  }
1768
1769
908k
  for( i = 0; i < 4; i++ )
1770
727k
  {
1771
6.54M
    for( j = 0; j < 8; j++ )
1772
5.81M
    {
1773
5.81M
      sad += abs( m2[i][j] );
1774
5.81M
    }
1775
727k
  }
1776
  
1777
181k
  sad -= abs( m2[0][0] );
1778
181k
  sad += abs( m2[0][0] ) >> 2;
1779
181k
  sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );
1780
1781
181k
  return sad;
1782
181k
}
1783
1784
Distortion RdCost::xGetHAD2SADs( const DistParam &rcDtParam )
1785
1.01M
{
1786
1.01M
  if( rcDtParam.applyWeight )
1787
0
  {
1788
0
    THROW(" no support");
1789
0
  }
1790
1791
1.01M
  Distortion distHad = xGetHADs<false>( rcDtParam );
1792
1.01M
  Distortion distSad = 0;
1793
1.01M
  {
1794
1.01M
    CHECKD( (rcDtParam.org.width != rcDtParam.org.stride) || (rcDtParam.cur.stride != rcDtParam.org.stride) , "this functions assumes compact, aligned buffering");
1795
1796
1.01M
    const Pel* piOrg  = rcDtParam.org.buf;
1797
1.01M
    const Pel* piCur  = rcDtParam.cur.buf;
1798
1.01M
    int  iRows        = rcDtParam.org.height>>2;
1799
1.01M
    int  iCols        = rcDtParam.org.width<<2;
1800
1801
1.01M
    Distortion uiSum = 0;
1802
1803
8.88M
    for( int y = 0; y < iRows;  y++ )
1804
7.86M
    {
1805
84.9M
      for (int n = 0; n < iCols; n+=16 )
1806
77.1M
      {
1807
77.1M
        uiSum += abs( piOrg[n+ 0] - piCur[n+ 0] );
1808
77.1M
        uiSum += abs( piOrg[n+ 1] - piCur[n+ 1] );
1809
77.1M
        uiSum += abs( piOrg[n+ 2] - piCur[n+ 2] );
1810
77.1M
        uiSum += abs( piOrg[n+ 3] - piCur[n+ 3] );
1811
77.1M
        uiSum += abs( piOrg[n+ 4] - piCur[n+ 4] );
1812
77.1M
        uiSum += abs( piOrg[n+ 5] - piCur[n+ 5] );
1813
77.1M
        uiSum += abs( piOrg[n+ 6] - piCur[n+ 6] );
1814
77.1M
        uiSum += abs( piOrg[n+ 7] - piCur[n+ 7] );
1815
77.1M
        uiSum += abs( piOrg[n+ 8] - piCur[n+ 8] );
1816
77.1M
        uiSum += abs( piOrg[n+ 9] - piCur[n+ 9] );
1817
77.1M
        uiSum += abs( piOrg[n+10] - piCur[n+10] );
1818
77.1M
        uiSum += abs( piOrg[n+11] - piCur[n+11] );
1819
77.1M
        uiSum += abs( piOrg[n+12] - piCur[n+12] );
1820
77.1M
        uiSum += abs( piOrg[n+13] - piCur[n+13] );
1821
77.1M
        uiSum += abs( piOrg[n+14] - piCur[n+14] );
1822
77.1M
        uiSum += abs( piOrg[n+15] - piCur[n+15] );
1823
77.1M
      }
1824
7.86M
      piOrg += iCols;
1825
7.86M
      piCur += iCols;
1826
7.86M
    }
1827
1828
1.01M
    distSad = (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1829
1.01M
  }
1830
1831
0
  return std::min( distHad, 2*distSad);
1832
1.01M
}
1833
1834
template<bool fastHad>
1835
Distortion RdCost::xGetHADs( const DistParam &rcDtParam )
1836
1.67M
{
1837
1.67M
  if( rcDtParam.applyWeight )
1838
0
  {
1839
0
    THROW(" no support");
1840
0
  }
1841
1.67M
  const Pel* piOrg = rcDtParam.org.buf;
1842
1.67M
  const Pel* piCur = rcDtParam.cur.buf;
1843
1.67M
  const int  iRows = rcDtParam.org.height;
1844
1.67M
  const int  iCols = rcDtParam.org.width;
1845
1.67M
  const int  iStrideCur = rcDtParam.cur.stride;
1846
1.67M
  const int  iStrideOrg = rcDtParam.org.stride;
1847
1848
1.67M
  int  x = 0, y = 0;
1849
1850
1.67M
  Distortion uiSum = 0;
1851
1852
1.67M
  if( iCols > iRows && ( iRows & 7 ) == 0 && ( iCols & 15 ) == 0 )
1853
361k
  {
1854
893k
    for( y = 0; y < iRows; y += 8 )
1855
531k
    {
1856
1.51M
      for( x = 0; x < iCols; x += 16 )
1857
987k
      {
1858
987k
        uiSum += xCalcHADs16x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1859
987k
      }
1860
531k
      piOrg += iStrideOrg * 8;
1861
531k
      piCur += iStrideCur * 8;
1862
531k
    }
1863
361k
  }
1864
1.31M
  else if( iCols < iRows && ( iCols & 7 ) == 0 && ( iRows & 15 ) == 0 )
1865
359k
  {
1866
1.00M
    for( y = 0; y < iRows; y += 16 )
1867
648k
    {
1868
1.63M
      for( x = 0; x < iCols; x += 8 )
1869
989k
      {
1870
989k
        uiSum += xCalcHADs8x16( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1871
989k
      }
1872
648k
      piOrg += iStrideOrg * 16;
1873
648k
      piCur += iStrideCur * 16;
1874
648k
    }
1875
359k
  }
1876
952k
  else if( iCols > iRows && ( iRows & 3 ) == 0 && ( iCols & 7 ) == 0 )
1877
97.6k
  {
1878
195k
    for( y = 0; y < iRows; y += 4 )
1879
97.6k
    {
1880
279k
      for( x = 0; x < iCols; x += 8 )
1881
181k
      {
1882
181k
        uiSum += xCalcHADs8x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1883
181k
      }
1884
97.6k
      piOrg += iStrideOrg * 4;
1885
97.6k
      piCur += iStrideCur * 4;
1886
97.6k
    }
1887
97.6k
  }
1888
854k
  else if( iCols < iRows && ( iCols & 3 ) == 0 && ( iRows & 7 ) == 0 )
1889
91.8k
  {
1890
267k
    for( y = 0; y < iRows; y += 8 )
1891
175k
    {
1892
350k
      for( x = 0; x < iCols; x += 4 )
1893
175k
      {
1894
175k
        uiSum += xCalcHADs4x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1895
175k
      }
1896
175k
      piOrg += iStrideOrg * 8;
1897
175k
      piCur += iStrideCur * 8;
1898
175k
    }
1899
91.8k
  }
1900
762k
  else if( fastHad && ( ( iRows % 32 == 0 ) && ( iCols % 32 == 0 ) ) && iRows == iCols )
1901
0
  {
1902
0
    for( y = 0; y < iRows; y += 16 )
1903
0
    {
1904
0
      for( x = 0; x < iCols; x += 16 )
1905
0
      {
1906
0
        uiSum += xCalcHADs16x16_fast( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1907
0
      }
1908
0
      piOrg += 16 * iStrideOrg;
1909
0
      piCur += 16 * iStrideCur;
1910
0
    }
1911
0
  }
1912
762k
  else if( ( iRows % 8 == 0 ) && ( iCols % 8 == 0 ) )
1913
685k
  {
1914
3.65M
    for( y = 0; y < iRows; y += 8 )
1915
2.96M
    {
1916
20.4M
      for( x = 0; x < iCols; x += 8 )
1917
17.4M
      {
1918
17.4M
        uiSum += xCalcHADs8x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1919
17.4M
      }
1920
2.96M
      piOrg += 8*iStrideOrg;
1921
2.96M
      piCur += 8*iStrideCur;
1922
2.96M
    }
1923
685k
  }
1924
76.9k
  else if( ( iRows % 4 == 0 ) && ( iCols % 4 == 0 ) )
1925
46.2k
  {
1926
92.5k
    for( y = 0; y < iRows; y += 4 )
1927
46.2k
    {
1928
92.5k
      for( x = 0; x < iCols; x += 4 )
1929
46.2k
      {
1930
46.2k
        uiSum += xCalcHADs4x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1931
46.2k
      }
1932
46.2k
      piOrg += 4*iStrideOrg;
1933
46.2k
      piCur += 4*iStrideCur;
1934
46.2k
    }
1935
46.2k
  }
1936
30.6k
  else if( ( iRows % 2 == 0 ) && ( iCols % 2 == 0 ) )
1937
30.6k
  {
1938
61.2k
    for( y = 0; y < iRows; y += 2 )
1939
30.6k
    {
1940
198k
      for( x = 0; x < iCols; x += 2 )
1941
168k
      {
1942
168k
        uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1943
168k
      }
1944
30.6k
      piOrg += 2*iStrideOrg;
1945
30.6k
      piCur += 2*iStrideCur;
1946
30.6k
    }
1947
30.6k
  }
1948
18.4E
  else
1949
18.4E
  {
1950
18.4E
    THROW( "Invalid size" );
1951
18.4E
  }
1952
1953
1.67M
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1954
1.67M
}
unsigned long vvenc::RdCost::xGetHADs<false>(vvenc::DistParam const&)
Line
Count
Source
1836
1.67M
{
1837
1.67M
  if( rcDtParam.applyWeight )
1838
0
  {
1839
0
    THROW(" no support");
1840
0
  }
1841
1.67M
  const Pel* piOrg = rcDtParam.org.buf;
1842
1.67M
  const Pel* piCur = rcDtParam.cur.buf;
1843
1.67M
  const int  iRows = rcDtParam.org.height;
1844
1.67M
  const int  iCols = rcDtParam.org.width;
1845
1.67M
  const int  iStrideCur = rcDtParam.cur.stride;
1846
1.67M
  const int  iStrideOrg = rcDtParam.org.stride;
1847
1848
1.67M
  int  x = 0, y = 0;
1849
1850
1.67M
  Distortion uiSum = 0;
1851
1852
1.67M
  if( iCols > iRows && ( iRows & 7 ) == 0 && ( iCols & 15 ) == 0 )
1853
361k
  {
1854
893k
    for( y = 0; y < iRows; y += 8 )
1855
531k
    {
1856
1.51M
      for( x = 0; x < iCols; x += 16 )
1857
987k
      {
1858
987k
        uiSum += xCalcHADs16x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1859
987k
      }
1860
531k
      piOrg += iStrideOrg * 8;
1861
531k
      piCur += iStrideCur * 8;
1862
531k
    }
1863
361k
  }
1864
1.31M
  else if( iCols < iRows && ( iCols & 7 ) == 0 && ( iRows & 15 ) == 0 )
1865
359k
  {
1866
1.00M
    for( y = 0; y < iRows; y += 16 )
1867
648k
    {
1868
1.63M
      for( x = 0; x < iCols; x += 8 )
1869
989k
      {
1870
989k
        uiSum += xCalcHADs8x16( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1871
989k
      }
1872
648k
      piOrg += iStrideOrg * 16;
1873
648k
      piCur += iStrideCur * 16;
1874
648k
    }
1875
359k
  }
1876
952k
  else if( iCols > iRows && ( iRows & 3 ) == 0 && ( iCols & 7 ) == 0 )
1877
97.6k
  {
1878
195k
    for( y = 0; y < iRows; y += 4 )
1879
97.6k
    {
1880
279k
      for( x = 0; x < iCols; x += 8 )
1881
181k
      {
1882
181k
        uiSum += xCalcHADs8x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1883
181k
      }
1884
97.6k
      piOrg += iStrideOrg * 4;
1885
97.6k
      piCur += iStrideCur * 4;
1886
97.6k
    }
1887
97.6k
  }
1888
854k
  else if( iCols < iRows && ( iCols & 3 ) == 0 && ( iRows & 7 ) == 0 )
1889
91.8k
  {
1890
267k
    for( y = 0; y < iRows; y += 8 )
1891
175k
    {
1892
350k
      for( x = 0; x < iCols; x += 4 )
1893
175k
      {
1894
175k
        uiSum += xCalcHADs4x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1895
175k
      }
1896
175k
      piOrg += iStrideOrg * 8;
1897
175k
      piCur += iStrideCur * 8;
1898
175k
    }
1899
91.8k
  }
1900
762k
  else if( fastHad && ( ( iRows % 32 == 0 ) && ( iCols % 32 == 0 ) ) && iRows == iCols )
1901
0
  {
1902
0
    for( y = 0; y < iRows; y += 16 )
1903
0
    {
1904
0
      for( x = 0; x < iCols; x += 16 )
1905
0
      {
1906
0
        uiSum += xCalcHADs16x16_fast( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1907
0
      }
1908
0
      piOrg += 16 * iStrideOrg;
1909
0
      piCur += 16 * iStrideCur;
1910
0
    }
1911
0
  }
1912
762k
  else if( ( iRows % 8 == 0 ) && ( iCols % 8 == 0 ) )
1913
685k
  {
1914
3.65M
    for( y = 0; y < iRows; y += 8 )
1915
2.96M
    {
1916
20.4M
      for( x = 0; x < iCols; x += 8 )
1917
17.4M
      {
1918
17.4M
        uiSum += xCalcHADs8x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1919
17.4M
      }
1920
2.96M
      piOrg += 8*iStrideOrg;
1921
2.96M
      piCur += 8*iStrideCur;
1922
2.96M
    }
1923
685k
  }
1924
76.9k
  else if( ( iRows % 4 == 0 ) && ( iCols % 4 == 0 ) )
1925
46.2k
  {
1926
92.5k
    for( y = 0; y < iRows; y += 4 )
1927
46.2k
    {
1928
92.5k
      for( x = 0; x < iCols; x += 4 )
1929
46.2k
      {
1930
46.2k
        uiSum += xCalcHADs4x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1931
46.2k
      }
1932
46.2k
      piOrg += 4*iStrideOrg;
1933
46.2k
      piCur += 4*iStrideCur;
1934
46.2k
    }
1935
46.2k
  }
1936
30.6k
  else if( ( iRows % 2 == 0 ) && ( iCols % 2 == 0 ) )
1937
30.6k
  {
1938
61.2k
    for( y = 0; y < iRows; y += 2 )
1939
30.6k
    {
1940
198k
      for( x = 0; x < iCols; x += 2 )
1941
168k
      {
1942
168k
        uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1943
168k
      }
1944
30.6k
      piOrg += 2*iStrideOrg;
1945
30.6k
      piCur += 2*iStrideCur;
1946
30.6k
    }
1947
30.6k
  }
1948
18.4E
  else
1949
18.4E
  {
1950
18.4E
    THROW( "Invalid size" );
1951
18.4E
  }
1952
1953
1.67M
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1954
1.67M
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs<true>(vvenc::DistParam const&)
1955
1956
1957
void RdCost::saveUnadjustedLambda()
1958
12.9k
{
1959
12.9k
  m_dLambda_unadjusted = m_dLambda;
1960
12.9k
  m_DistScaleUnadjusted = m_DistScale;
1961
12.9k
}
1962
1963
1964
inline Distortion getWeightedMSE(const Pel org, const Pel cur, const int64_t fixedPTweight, unsigned uiShift)
1965
0
{
1966
0
  const Intermediate_Int iTemp = org - cur;
1967
0
  return Intermediate_Int((fixedPTweight*(iTemp*iTemp) + (1 << 15)) >> uiShift);
1968
0
}
1969
1970
template<int csx>
1971
static Distortion lumaWeightedSSE_Core( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights )
1972
0
{
1973
0
        int  iRows = rcDtParam.org.height;
1974
0
  const Pel* piOrg = rcDtParam.org.buf;
1975
0
  const Pel* piCur = rcDtParam.cur.buf;
1976
0
  const int  iCols = rcDtParam.org.width;
1977
0
  const int  iStrideCur = rcDtParam.cur.stride;
1978
0
  const int  iStrideOrg = rcDtParam.org.stride;
1979
0
  const Pel* piOrgLuma        = rcDtParam.orgLuma->buf;
1980
0
  const int  iStrideOrgLuma   = rcDtParam.orgLuma->stride;
1981
1982
0
  Distortion uiSum   = 0;
1983
0
  uint32_t uiShift   = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1);
1984
1985
  // cf, column factor, offset of the second column, to be set to '0' for width of '1'
1986
0
  const int cf =  1 - ( iCols & 1 );
1987
0
  CHECK( ( iCols & 1 ) && iCols != 1, "Width can only be even or equal to '1'!" );
1988
0
  const ComponentID compId = rcDtParam.compID;
1989
0
  const size_t  cShiftY    = getComponentScaleY(compId, chmFmt);
1990
1991
0
  for( ; iRows != 0; iRows-- )
1992
0
  {
1993
0
    for (int n = 0; n < iCols; n+=2 )
1994
0
    {
1995
0
      uiSum += getWeightedMSE( piOrg[n   ], piCur[n   ], lumaWeights[piOrgLuma[(n   )<<csx]], uiShift );
1996
0
      uiSum += getWeightedMSE( piOrg[n+cf], piCur[n+cf], lumaWeights[piOrgLuma[(n+cf)<<csx]], uiShift );
1997
0
    }
1998
1999
0
    piOrg     += iStrideOrg;
2000
0
    piCur     += iStrideCur;
2001
0
    piOrgLuma += iStrideOrgLuma<<cShiftY;
2002
0
  }
2003
2004
0
  return ( uiSum >> ( 1 - cf ) );
2005
0
}
Unexecuted instantiation: RdCost.cpp:unsigned long vvenc::lumaWeightedSSE_Core<0>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
Unexecuted instantiation: RdCost.cpp:unsigned long vvenc::lumaWeightedSSE_Core<1>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
2006
2007
static Distortion fixWeightedSSE_Core( const DistParam& rcDtParam, uint32_t fixedPTweight )
2008
0
{
2009
0
        int  iRows = rcDtParam.org.height;
2010
0
  const Pel* piOrg = rcDtParam.org.buf;
2011
0
  const Pel* piCur = rcDtParam.cur.buf;
2012
0
  const int  iCols = rcDtParam.org.width;
2013
0
  const int  iStrideCur = rcDtParam.cur.stride;
2014
0
  const int  iStrideOrg = rcDtParam.org.stride;
2015
2016
0
  Distortion uiSum   = 0;
2017
0
  uint32_t uiShift = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1);
2018
2019
  // cf, column factor, offset of the second column, to be set to '0' for width of '1'
2020
0
  const int cf =  1 - ( iCols & 1 );
2021
0
  CHECK( ( iCols & 1 ) && iCols != 1, "Width can only be even or equal to '1'!" );
2022
  
2023
0
  for( ; iRows != 0; iRows-- )
2024
0
  {
2025
0
    for (int n = 0; n < iCols; n+=2 )
2026
0
    {
2027
0
      uiSum += getWeightedMSE( piOrg[n   ], piCur[n   ], fixedPTweight, uiShift );
2028
0
      uiSum += getWeightedMSE( piOrg[n+cf], piCur[n+cf], fixedPTweight, uiShift );
2029
0
    }
2030
0
    piOrg += iStrideOrg;
2031
0
    piCur += iStrideCur;
2032
0
  }
2033
2034
0
  return ( uiSum >> ( 1 - cf ) );
2035
0
}
2036
2037
Distortion RdCost::xGetSSE_WTD( const DistParam &rcDtParam ) const
2038
0
{
2039
0
  if( rcDtParam.applyWeight )
2040
0
  {
2041
0
    THROW("no support");
2042
0
  }
2043
2044
0
  if ((m_signalType == RESHAPE_SIGNAL_SDR || m_signalType == RESHAPE_SIGNAL_HLG) && rcDtParam.compID != COMP_Y)
2045
0
  {
2046
0
    const uint32_t fixedPTweight = ( uint32_t ) ( m_chromaWeight * ( double ) ( 1 << 16 ) );
2047
2048
0
    return m_fxdWtdPredPtr( rcDtParam, fixedPTweight );
2049
0
  }
2050
0
  else
2051
0
  {
2052
0
    return m_wtdPredPtr[getComponentScaleX(rcDtParam.compID, m_cf)]( rcDtParam, m_cf, m_reshapeLumaLevelToWeightPLUT );
2053
0
  }
2054
2055
0
  return 0;
2056
0
}
2057
2058
0
void RdCost::xGetSAD8X5(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
2059
0
  DistParam rcDtParamTmp0 = rcDtParam;
2060
2061
0
  DistParam rcDtParamTmp1 = rcDtParam;
2062
0
  rcDtParamTmp1.org.buf += 1;
2063
0
  rcDtParamTmp1.cur.buf -= 1;
2064
2065
0
  DistParam rcDtParamTmp2 = rcDtParam;
2066
0
  rcDtParamTmp2.org.buf += 2;
2067
0
  rcDtParamTmp2.cur.buf -= 2;
2068
2069
0
  DistParam rcDtParamTmp3 = rcDtParam;
2070
0
  rcDtParamTmp3.org.buf += 3;
2071
0
  rcDtParamTmp3.cur.buf -= 3;
2072
2073
0
  DistParam rcDtParamTmp4 = rcDtParam;
2074
0
  rcDtParamTmp4.org.buf += 4;
2075
0
  rcDtParamTmp4.cur.buf -= 4;
2076
  
2077
0
  cost[0] = (RdCost::xGetSAD8(rcDtParamTmp0)) >> 1;
2078
0
  cost[1] = (RdCost::xGetSAD8(rcDtParamTmp1)) >> 1;
2079
0
  if (isCalCentrePos) cost[2] = (RdCost::xGetSAD8(rcDtParamTmp2)) >> 1;
2080
0
  cost[3] = (RdCost::xGetSAD8(rcDtParamTmp3)) >> 1;
2081
0
  cost[4] = (RdCost::xGetSAD8(rcDtParamTmp4)) >> 1;
2082
0
}
2083
2084
0
void RdCost::xGetSAD16X5(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
2085
0
  DistParam rcDtParamTmp0 = rcDtParam;
2086
2087
0
  DistParam rcDtParamTmp1 = rcDtParam;
2088
0
  rcDtParamTmp1.org.buf += 1;
2089
0
  rcDtParamTmp1.cur.buf -= 1;
2090
2091
0
  DistParam rcDtParamTmp2 = rcDtParam;
2092
0
  rcDtParamTmp2.org.buf += 2;
2093
0
  rcDtParamTmp2.cur.buf -= 2;
2094
2095
0
  DistParam rcDtParamTmp3 = rcDtParam;
2096
0
  rcDtParamTmp3.org.buf += 3;
2097
0
  rcDtParamTmp3.cur.buf -= 3;
2098
2099
0
  DistParam rcDtParamTmp4 = rcDtParam;
2100
0
  rcDtParamTmp4.org.buf += 4;
2101
0
  rcDtParamTmp4.cur.buf -= 4;
2102
  
2103
0
  cost[0] = (RdCost::xGetSAD16(rcDtParamTmp0)) >> 1;
2104
0
  cost[1] = (RdCost::xGetSAD16(rcDtParamTmp1)) >> 1;
2105
0
  if (isCalCentrePos) cost[2] = (RdCost::xGetSAD16(rcDtParamTmp2)) >> 1;
2106
0
  cost[3] = (RdCost::xGetSAD16(rcDtParamTmp3)) >> 1;
2107
0
  cost[4] = (RdCost::xGetSAD16(rcDtParamTmp4)) >> 1;
2108
0
}
2109
2110
void RdCost::setDistParamGeo(DistParam &rcDP, const CPelBuf &org, const Pel *piRefY, int iRefStride, const Pel *mask,
2111
                          int iMaskStride, int stepX, int iMaskStride2, int bitDepth, ComponentID compID)
2112
0
{
2113
0
  rcDP.bitDepth = bitDepth;
2114
0
  rcDP.compID   = compID;
2115
2116
  // set Original & Curr Pointer / Stride
2117
0
  rcDP.org        = org;
2118
0
  rcDP.cur.buf    = piRefY;
2119
0
  rcDP.cur.stride = iRefStride;
2120
2121
  // set Mask
2122
0
  rcDP.mask        = mask;
2123
0
  rcDP.maskStride  = iMaskStride;
2124
0
  rcDP.stepX       = stepX;
2125
0
  rcDP.maskStride2 = iMaskStride2;
2126
2127
  // set Block Width / Height
2128
0
  rcDP.cur.width                     = org.width;
2129
0
  rcDP.cur.height                    = org.height;
2130
0
  rcDP.maximumDistortionForEarlyExit = MAX_DISTORTION;
2131
2132
  // set Cost function for motion estimation with Mask
2133
0
  rcDP.distFunc = m_afpDistortFunc[0][DF_SAD_WITH_MASK];
2134
0
}
2135
2136
Distortion RdCost::xGetSADwMask(const DistParam &rcDtParam)
2137
0
{
2138
0
  const Pel *    org             = rcDtParam.org.buf;
2139
0
  const Pel *    cur             = rcDtParam.cur.buf;
2140
0
  const Pel *    mask            = rcDtParam.mask;
2141
0
  const int      cols            = rcDtParam.org.width;
2142
0
  int            rows            = rcDtParam.org.height;
2143
0
  const int      subShift        = rcDtParam.subShift;
2144
0
  const int      subStep         = (1 << subShift);
2145
0
  const int      strideCur       = rcDtParam.cur.stride * subStep;
2146
0
  const int      strideOrg       = rcDtParam.org.stride * subStep;
2147
0
  const int      strideMask      = rcDtParam.maskStride * subStep;
2148
0
  const int      stepX           = rcDtParam.stepX;
2149
0
  const int      strideMask2     = rcDtParam.maskStride2;
2150
0
  const uint32_t distortionShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
2151
2152
0
  Distortion sum = 0;
2153
0
  for (; rows != 0; rows -= subStep)
2154
0
  {
2155
0
    for (int n = 0; n < cols; n++)
2156
0
    {
2157
0
      sum += abs(org[n] - cur[n]) * *mask;
2158
0
      mask += stepX;
2159
0
    }
2160
0
    org += strideOrg;
2161
0
    cur += strideCur;
2162
0
    mask += strideMask;
2163
0
    mask += strideMask2;
2164
0
  }
2165
0
  sum <<= subShift;
2166
0
  return (sum >> distortionShift);
2167
0
}
2168
2169
Distortion RdCost::getBvCostMultiplePredsIBC(int x, int y, bool useIMV)
2170
1.33M
{
2171
1.33M
  return Distortion(m_dCostIBC * getBitsMultiplePredsIBC(x, y, useIMV));
2172
1.33M
}
2173
2174
static inline unsigned getIComponentBitsIBC( int val )
2175
3.65M
{
2176
3.65M
  if( !val ) return 1;
2177
2178
18.4E
  const unsigned int l2 = floorLog2( (val <= 0) ? (-val << 1) + 1 : (val << 1) );
2179
2180
1.83M
  return (l2 << 1) + 1;
2181
3.65M
}
2182
2183
unsigned int RdCost::getBitsMultiplePredsIBC(int x, int y, bool useIMV)
2184
1.33M
{
2185
1.33M
  int rmvH[2];
2186
1.33M
  int rmvV[2];
2187
1.33M
  rmvH[0] = x - m_bvPredictors[0].hor;
2188
1.33M
  rmvH[1] = x - m_bvPredictors[1].hor;
2189
2190
1.33M
  rmvV[0] = y - m_bvPredictors[0].ver;
2191
1.33M
  rmvV[1] = y - m_bvPredictors[1].ver;
2192
1.33M
  int absCand[2];
2193
1.33M
  absCand[0] = abs(rmvH[0]) + abs(rmvV[0]);
2194
1.33M
  absCand[1] = abs(rmvH[1]) + abs(rmvV[1]);
2195
2196
1.33M
  if (useIMV && x % 4 == 0 && y % 4 == 0)
2197
488k
  {
2198
488k
    int rmvHQP[2];
2199
488k
    int rmvVQP[2];
2200
2201
488k
    int imvShift = 2;
2202
488k
    int offset = 1 << (imvShift - 1);
2203
2204
488k
    rmvHQP[0] = (x >> 2) - ((m_bvPredictors[0].hor + offset) >> 2);
2205
488k
    rmvHQP[1] = (x >> 2) - ((m_bvPredictors[1].hor + offset) >> 2);
2206
488k
    rmvVQP[0] = (y >> 2) - ((m_bvPredictors[0].ver + offset) >> 2);
2207
488k
    rmvVQP[1] = (y >> 2) - ((m_bvPredictors[1].ver + offset) >> 2);
2208
2209
488k
    int absCandQP[2];
2210
488k
    absCandQP[0] = abs(rmvHQP[0]) + abs(rmvVQP[0]);
2211
488k
    absCandQP[1] = abs(rmvHQP[1]) + abs(rmvVQP[1]);
2212
488k
    unsigned int candBits0QP, candBits1QP;
2213
488k
    if (absCand[0] < absCand[1])
2214
0
    {
2215
0
      unsigned int candBits0 = getIComponentBitsIBC(rmvH[0]) + getIComponentBitsIBC(rmvV[0]);
2216
0
      if (absCandQP[0] < absCandQP[1])
2217
0
      {
2218
0
        candBits0QP = getIComponentBitsIBC(rmvHQP[0]) + getIComponentBitsIBC(rmvVQP[0]);
2219
0
        return candBits0QP < candBits0 ? candBits0QP : candBits0;
2220
0
      }
2221
0
      else
2222
0
      {
2223
0
        candBits1QP = getIComponentBitsIBC(rmvHQP[1]) + getIComponentBitsIBC(rmvVQP[1]);
2224
0
        return candBits1QP < candBits0 ? candBits1QP : candBits0;
2225
0
      }
2226
0
    }
2227
488k
    else
2228
488k
    {
2229
488k
      unsigned int candBits1 = getIComponentBitsIBC(rmvH[1]) + getIComponentBitsIBC(rmvV[1]);
2230
488k
      if (absCandQP[0] < absCandQP[1])
2231
0
      {
2232
0
        candBits0QP = getIComponentBitsIBC(rmvHQP[0]) + getIComponentBitsIBC(rmvVQP[0]);
2233
0
        return candBits0QP < candBits1 ? candBits0QP : candBits1;
2234
0
      }
2235
488k
      else
2236
488k
      {
2237
488k
        candBits1QP = getIComponentBitsIBC(rmvHQP[1]) + getIComponentBitsIBC(rmvVQP[1]);
2238
18.4E
        return candBits1QP < candBits1 ? candBits1QP : candBits1;
2239
488k
      }
2240
488k
    }
2241
488k
  }
2242
847k
  else
2243
847k
  {
2244
847k
    if (absCand[0] < absCand[1])
2245
0
    {
2246
0
      return getIComponentBitsIBC(rmvH[0]) + getIComponentBitsIBC(rmvV[0]);
2247
0
    }
2248
847k
    else
2249
847k
    {
2250
847k
      return getIComponentBitsIBC(rmvH[1]) + getIComponentBitsIBC(rmvV[1]);
2251
847k
    }
2252
847k
  }
2253
1.33M
}
2254
2255
} // namespace vvenc
2256
2257
//! \}
2258