Coverage Report

Created: 2026-05-16 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/vvenc/source/Lib/CommonLib/RdCost.cpp
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
44
/** \file     RdCost.cpp
45
    \brief    RD cost computation class
46
*/
47
48
#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP
49
50
#include "RdCost.h"
51
#include "Rom.h"
52
#include "UnitPartitioner.h"
53
#include "SearchSpaceCounter.h"
54
55
56
//! \ingroup CommonLib
57
//! \{
58
59
namespace vvenc {
60
61
62
template<int csx>
63
static Distortion lumaWeightedSSE_Core( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights );
64
65
static Distortion fixWeightedSSE_Core( const DistParam& rcDtParam, uint32_t fixedWeight );
66
67
RdCost::RdCost()
68
19.6k
  : m_afpDistortFunc{ { nullptr, }, { nullptr, } }
69
19.6k
{
70
19.6k
}
71
72
RdCost::~RdCost()
73
19.6k
{
74
19.6k
}
75
76
void RdCost::setLambda( double dLambda, const BitDepths &bitDepths )
77
16.7k
{
78
16.7k
  m_dLambda          = dLambda;
79
16.7k
  m_DistScale        = double(1<<SCALE_BITS) / m_dLambda;
80
16.7k
  m_dLambdaMotionSAD = sqrt(m_dLambda);
81
16.7k
}
82
83
84
// Initialize Function Pointer by [eDFunc]
85
void RdCost::create( bool enableOpt )
86
19.6k
{
87
19.6k
  m_signalType                 = RESHAPE_SIGNAL_NULL;
88
19.6k
  m_chromaWeight               = 1.0;
89
19.6k
  m_lumaBD                     = 10;
90
19.6k
  m_afpDistortFunc[0][DF_SSE    ] = RdCost::xGetSSE;
91
19.6k
  m_afpDistortFunc[0][DF_SSE2   ] = RdCost::xGetSSE;
92
19.6k
  m_afpDistortFunc[0][DF_SSE4   ] = RdCost::xGetSSE4;
93
19.6k
  m_afpDistortFunc[0][DF_SSE8   ] = RdCost::xGetSSE8;
94
19.6k
  m_afpDistortFunc[0][DF_SSE16  ] = RdCost::xGetSSE16;
95
19.6k
  m_afpDistortFunc[0][DF_SSE32  ] = RdCost::xGetSSE32;
96
19.6k
  m_afpDistortFunc[0][DF_SSE64  ] = RdCost::xGetSSE64;
97
19.6k
  m_afpDistortFunc[0][DF_SSE128 ] = RdCost::xGetSSE128;
98
99
19.6k
  m_afpDistortFunc[0][DF_SAD    ] = RdCost::xGetSAD;
100
19.6k
  m_afpDistortFunc[0][DF_SAD2   ] = RdCost::xGetSAD;
101
19.6k
  m_afpDistortFunc[0][DF_SAD4   ] = RdCost::xGetSAD4;
102
19.6k
  m_afpDistortFunc[0][DF_SAD8   ] = RdCost::xGetSAD8;
103
19.6k
  m_afpDistortFunc[0][DF_SAD16  ] = RdCost::xGetSAD16;
104
19.6k
  m_afpDistortFunc[0][DF_SAD32  ] = RdCost::xGetSAD32;
105
19.6k
  m_afpDistortFunc[0][DF_SAD64  ] = RdCost::xGetSAD64;
106
19.6k
  m_afpDistortFunc[0][DF_SAD128 ] = RdCost::xGetSAD128;
107
108
19.6k
  m_afpDistortFunc[0][DF_HAD    ] = RdCost::xGetHADs<false>;
109
19.6k
  m_afpDistortFunc[0][DF_HAD2   ] = RdCost::xGetHADs<false>;
110
19.6k
  m_afpDistortFunc[0][DF_HAD4   ] = RdCost::xGetHADs<false>;
111
19.6k
  m_afpDistortFunc[0][DF_HAD8   ] = RdCost::xGetHADs<false>;
112
19.6k
  m_afpDistortFunc[0][DF_HAD16  ] = RdCost::xGetHADs<false>;
113
19.6k
  m_afpDistortFunc[0][DF_HAD32  ] = RdCost::xGetHADs<false>;
114
19.6k
  m_afpDistortFunc[0][DF_HAD64  ] = RdCost::xGetHADs<false>;
115
19.6k
  m_afpDistortFunc[0][DF_HAD128 ] = RdCost::xGetHADs<false>;
116
117
19.6k
  m_afpDistortFunc[0][DF_HAD_fast    ] = RdCost::xGetHADs<true>;
118
19.6k
  m_afpDistortFunc[0][DF_HAD2_fast   ] = RdCost::xGetHADs<true>;
119
19.6k
  m_afpDistortFunc[0][DF_HAD4_fast   ] = RdCost::xGetHADs<true>;
120
19.6k
  m_afpDistortFunc[0][DF_HAD8_fast   ] = RdCost::xGetHADs<true>;
121
19.6k
  m_afpDistortFunc[0][DF_HAD16_fast  ] = RdCost::xGetHADs<true>;
122
19.6k
  m_afpDistortFunc[0][DF_HAD32_fast  ] = RdCost::xGetHADs<true>;
123
19.6k
  m_afpDistortFunc[0][DF_HAD64_fast  ] = RdCost::xGetHADs<true>;
124
19.6k
  m_afpDistortFunc[0][DF_HAD128_fast ] = RdCost::xGetHADs<true>;
125
126
  //  m_afpDistortFunc[0][DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD;
127
19.6k
  m_afpDistortFunc[0][DF_HAD_2SAD ] = RdCost::xGetHAD2SADs;
128
129
19.6k
  m_afpDistortFunc[0][DF_SAD_WITH_MASK] = RdCost::xGetSADwMask;
130
  // m_afpDistortFunc[1] can be used in any case
131
19.6k
  memcpy( m_afpDistortFunc[1], m_afpDistortFunc[0], sizeof(m_afpDistortFunc)/2);
132
133
19.6k
  m_wtdPredPtr[0] = lumaWeightedSSE_Core<0>;
134
19.6k
  m_wtdPredPtr[1] = lumaWeightedSSE_Core<1>;
135
19.6k
  m_fxdWtdPredPtr = fixWeightedSSE_Core;
136
137
19.6k
  m_afpDistortFuncX5[0] = RdCost::xGetSAD8X5;
138
19.6k
  m_afpDistortFuncX5[1] = RdCost::xGetSAD16X5;
139
140
19.6k
#if ENABLE_SIMD_OPT_DIST
141
19.6k
  if( enableOpt )
142
19.6k
  {
143
#ifdef TARGET_SIMD_X86
144
    initRdCostX86();
145
#endif
146
#ifdef TARGET_SIMD_ARM
147
    initRdCostARM();
148
#endif
149
19.6k
  }
150
19.6k
#endif
151
152
19.6k
  m_costMode      = VVENC_COST_STANDARD_LOSSY;
153
19.6k
  m_motionLambda  = 0;
154
19.6k
  m_iCostScale    = 0;
155
19.6k
}
156
157
#if ENABLE_MEASURE_SEARCH_SPACE
158
static Distortion xMeasurePredSearchSpaceInterceptor( const DistParam& dp )
159
{
160
  g_searchSpaceAcc.addPrediction( dp.cur.width, dp.cur.height, toChannelType( dp.compID ) );
161
  return dp.xDistFunc( dp );
162
}
163
164
#endif
165
void RdCost::setDistParam( DistParam &rcDP, const CPelBuf& org, const Pel* piRefY, int iRefStride, int bitDepth, ComponentID compID, int subShiftMode, int useHadamard )
166
50.4k
{
167
50.4k
  rcDP.bitDepth   = bitDepth;
168
50.4k
  rcDP.compID     = compID;
169
170
  // set Original & Curr Pointer / Stride
171
50.4k
  rcDP.org        = org;
172
173
50.4k
  rcDP.cur.buf    = piRefY;
174
50.4k
  rcDP.cur.stride = iRefStride;
175
176
  // set Block Width / Height
177
50.4k
  rcDP.cur.width    = org.width;
178
50.4k
  rcDP.cur.height   = org.height;
179
50.4k
  rcDP.maximumDistortionForEarlyExit = MAX_DISTORTION;
180
181
50.4k
  const int base = (rcDP.bitDepth > 10 || rcDP.applyWeight) ? 1 : 0;
182
50.4k
  if( !useHadamard )
183
50.4k
  {
184
50.4k
    rcDP.distFunc = m_afpDistortFunc[base][ DF_SAD + Log2( org.width ) ];
185
50.4k
  }
186
0
  else
187
0
  {
188
0
    rcDP.distFunc = m_afpDistortFunc[base][( useHadamard == 1 ? DF_HAD : DF_HAD_fast ) + Log2( org.width ) ];
189
0
  }
190
191
  // initialize
192
50.4k
  rcDP.subShift  = 0;
193
194
50.4k
  if( subShiftMode == 1 )
195
0
  {
196
0
    if( rcDP.org.height > 8 && rcDP.org.width <= 128 )
197
0
    {
198
0
      rcDP.subShift = 1;
199
0
    }
200
0
  }
201
50.4k
  else if( subShiftMode == 2 )
202
0
  {
203
0
    if (rcDP.org.height > 8)
204
0
    {
205
0
      rcDP.subShift = 1;
206
0
    }
207
0
  }
208
209
#if ENABLE_MEASURE_SEARCH_SPACE
210
  rcDP.xDistFunc = rcDP.distFunc;
211
  rcDP.distFunc  = xMeasurePredSearchSpaceInterceptor;
212
#endif
213
50.4k
}
214
215
216
DistParam RdCost::setDistParam( const CPelBuf& org, const CPelBuf& cur, int bitDepth, DFunc dfunc )
217
281k
{
218
281k
  int index = dfunc;
219
281k
  if( dfunc != DF_HAD && dfunc != DF_HAD_fast && dfunc != DF_HAD_2SAD )
220
115k
  {
221
115k
    index += Log2(org.width);
222
115k
  }
223
224
281k
  const int base = bitDepth > 10 ? 1:0; //TBD: check does SDA ever overflow
225
#if ENABLE_MEASURE_SEARCH_SPACE
226
  DistParam rcDP( org, cur, m_afpDistortFunc[base][index], bitDepth, 0, COMP_Y );
227
  rcDP.xDistFunc = rcDP.distFunc;
228
  rcDP.distFunc  = xMeasurePredSearchSpaceInterceptor;
229
  return rcDP;
230
#else
231
281k
  return DistParam( org, cur, m_afpDistortFunc[base][index], bitDepth, 0, COMP_Y );
232
281k
#endif
233
281k
}
234
235
DistParam RdCost::setDistParam( const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShift, bool isDMVR )
236
0
{
237
0
  DistParam rcDP;
238
0
  rcDP.bitDepth   = bitDepth;
239
0
  rcDP.compID     = compID;
240
241
0
  rcDP.org.buf    = pOrg;
242
0
  rcDP.org.stride = iOrgStride;
243
0
  rcDP.org.width  = width;
244
0
  rcDP.org.height = height;
245
246
0
  rcDP.cur.buf    = piRefY;
247
0
  rcDP.cur.stride = iRefStride;
248
0
  rcDP.cur.width  = width;
249
0
  rcDP.cur.height = height;
250
0
  rcDP.subShift   = subShift;
251
252
  //  CHECK( useHadamard || rcDP.useMR, "only used in xDMVRCost with these default parameters (so far...)" );
253
0
  const int base = (rcDP.bitDepth > 10) ? 1 : 0;
254
255
0
  rcDP.distFunc = m_afpDistortFunc[base][ DF_SAD + Log2( width ) ];
256
  
257
0
  if( isDMVR )
258
0
  {
259
0
    rcDP.dmvrSadX5 = m_afpDistortFuncX5[Log2( width ) - 3];
260
0
  }
261
262
#if ENABLE_MEASURE_SEARCH_SPACE
263
  if( !isDMVR )
264
  {
265
    // DMVT is part of the decoder complexity
266
    rcDP.xDistFunc = rcDP.distFunc;
267
    rcDP.distFunc = xMeasurePredSearchSpaceInterceptor;
268
  }
269
270
#endif
271
0
  return rcDP;
272
0
}
273
274
Distortion RdCost::getDistPart( const CPelBuf& org, const CPelBuf& cur, int bitDepth, const ComponentID compId, DFunc eDFunc, const CPelBuf* orgLuma )
275
2.76M
{
276
2.76M
  DistParam dp( org, cur, nullptr, bitDepth, 0, compId );
277
# if ENABLE_MEASURE_SEARCH_SPACE
278
  g_searchSpaceAcc.addPrediction( dp.cur.width, dp.cur.height, toChannelType( dp.compID ) );
279
#endif
280
2.76M
  Distortion dist;
281
2.76M
  if( orgLuma )
282
0
  {
283
0
    CHECKD( eDFunc != DF_SSE_WTD, "mismatch func and parameter")
284
0
    dp.orgLuma  = orgLuma;
285
0
    dist = RdCost::xGetSSE_WTD( dp );
286
0
  }
287
2.76M
  else
288
2.76M
  {
289
2.76M
    if( ( org.width == 1 ) )
290
0
    {
291
0
      dist = xGetSSE( dp );
292
0
    }
293
2.76M
    else
294
2.76M
    {
295
2.76M
      const int base = (bitDepth > 10) ? 1 : 0;
296
2.76M
      dist = m_afpDistortFunc[base][eDFunc + Log2(org.width)](dp);
297
2.76M
    }
298
2.76M
  }
299
2.76M
  if (isChroma(compId))
300
2.32M
  {
301
2.32M
    return ((Distortion) (m_distortionWeight[ compId ] * dist));
302
2.32M
  }
303
440k
  else
304
440k
  {
305
440k
    return dist;
306
440k
  }
307
2.76M
}
308
309
// ====================================================================================================================
310
// Distortion functions
311
// ====================================================================================================================
312
313
// --------------------------------------------------------------------------------------------------------------------
314
// SAD
315
// --------------------------------------------------------------------------------------------------------------------
316
317
Distortion RdCost::xGetSAD( const DistParam& rcDtParam )
318
0
{
319
0
  if ( rcDtParam.applyWeight )
320
0
  {
321
0
    THROW(" no support");
322
0
  }
323
324
0
  const Pel* piOrg           = rcDtParam.org.buf;
325
0
  const Pel* piCur           = rcDtParam.cur.buf;
326
0
  const int  iCols           = rcDtParam.org.width;
327
0
        int  iRows           = rcDtParam.org.height;
328
0
  const int  iSubShift       = rcDtParam.subShift;
329
0
  const int  iSubStep        = ( 1 << iSubShift );
330
0
  const int  iStrideCur      = rcDtParam.cur.stride * iSubStep;
331
0
  const int  iStrideOrg      = rcDtParam.org.stride * iSubStep;
332
0
  const uint32_t distortionShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
333
334
0
  Distortion uiSum = 0;
335
336
0
  for( ; iRows != 0; iRows -= iSubStep )
337
0
  {
338
0
    for (int n = 0; n < iCols; n++ )
339
0
    {
340
0
      uiSum += abs( piOrg[n] - piCur[n] );
341
0
    }
342
0
    if (rcDtParam.maximumDistortionForEarlyExit < ( uiSum >> distortionShift ))
343
0
    {
344
0
      return ( uiSum >> distortionShift );
345
0
    }
346
0
    piOrg += iStrideOrg;
347
0
    piCur += iStrideCur;
348
0
  }
349
350
0
  uiSum <<= iSubShift;
351
0
  return ( uiSum >> distortionShift );
352
0
}
353
354
Distortion RdCost::xGetSAD4( const DistParam& rcDtParam )
355
134k
{
356
134k
  if ( rcDtParam.applyWeight )
357
0
  {
358
0
    THROW(" no support");
359
0
  }
360
361
134k
  const Pel* piOrg   = rcDtParam.org.buf;
362
134k
  const Pel* piCur   = rcDtParam.cur.buf;
363
134k
  int  iRows         = rcDtParam.org.height;
364
134k
  int  iSubShift     = rcDtParam.subShift;
365
134k
  int  iSubStep      = ( 1 << iSubShift );
366
134k
  int  iStrideCur    = rcDtParam.cur.stride * iSubStep;
367
134k
  int  iStrideOrg    = rcDtParam.org.stride * iSubStep;
368
369
134k
  Distortion uiSum = 0;
370
371
1.66M
  for( ; iRows != 0; iRows -= iSubStep )
372
1.52M
  {
373
1.52M
    uiSum += abs( piOrg[0] - piCur[0] );
374
1.52M
    uiSum += abs( piOrg[1] - piCur[1] );
375
1.52M
    uiSum += abs( piOrg[2] - piCur[2] );
376
1.52M
    uiSum += abs( piOrg[3] - piCur[3] );
377
378
1.52M
    piOrg += iStrideOrg;
379
1.52M
    piCur += iStrideCur;
380
1.52M
  }
381
382
134k
  uiSum <<= iSubShift;
383
134k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
384
134k
}
385
386
Distortion RdCost::xGetSAD8( const DistParam& rcDtParam )
387
549k
{
388
549k
  if ( rcDtParam.applyWeight )
389
0
  {
390
0
    THROW(" no support");
391
0
  }
392
393
549k
  const Pel* piOrg      = rcDtParam.org.buf;
394
549k
  const Pel* piCur      = rcDtParam.cur.buf;
395
549k
  int  iRows            = rcDtParam.org.height;
396
549k
  int  iSubShift        = rcDtParam.subShift;
397
549k
  int  iSubStep         = ( 1 << iSubShift );
398
549k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
399
549k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
400
401
549k
  Distortion uiSum = 0;
402
403
11.5M
  for( ; iRows != 0; iRows-=iSubStep )
404
10.9M
  {
405
10.9M
    uiSum += abs( piOrg[0] - piCur[0] );
406
10.9M
    uiSum += abs( piOrg[1] - piCur[1] );
407
10.9M
    uiSum += abs( piOrg[2] - piCur[2] );
408
10.9M
    uiSum += abs( piOrg[3] - piCur[3] );
409
10.9M
    uiSum += abs( piOrg[4] - piCur[4] );
410
10.9M
    uiSum += abs( piOrg[5] - piCur[5] );
411
10.9M
    uiSum += abs( piOrg[6] - piCur[6] );
412
10.9M
    uiSum += abs( piOrg[7] - piCur[7] );
413
414
10.9M
    piOrg += iStrideOrg;
415
10.9M
    piCur += iStrideCur;
416
10.9M
  }
417
418
549k
  uiSum <<= iSubShift;
419
549k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
420
549k
}
421
422
Distortion RdCost::xGetSAD16( const DistParam& rcDtParam )
423
477k
{
424
477k
  if ( rcDtParam.applyWeight )
425
0
  {
426
0
    THROW(" no support");
427
0
  }
428
429
477k
  const Pel* piOrg      = rcDtParam.org.buf;
430
477k
  const Pel* piCur      = rcDtParam.cur.buf;
431
477k
  int  iRows            = rcDtParam.org.height;
432
477k
  int  iSubShift        = rcDtParam.subShift;
433
477k
  int  iSubStep         = ( 1 << iSubShift );
434
477k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
435
477k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
436
437
477k
  Distortion uiSum = 0;
438
439
10.6M
  for( ; iRows != 0; iRows -= iSubStep )
440
10.1M
  {
441
10.1M
    uiSum += abs( piOrg[0] - piCur[0] );
442
10.1M
    uiSum += abs( piOrg[1] - piCur[1] );
443
10.1M
    uiSum += abs( piOrg[2] - piCur[2] );
444
10.1M
    uiSum += abs( piOrg[3] - piCur[3] );
445
10.1M
    uiSum += abs( piOrg[4] - piCur[4] );
446
10.1M
    uiSum += abs( piOrg[5] - piCur[5] );
447
10.1M
    uiSum += abs( piOrg[6] - piCur[6] );
448
10.1M
    uiSum += abs( piOrg[7] - piCur[7] );
449
10.1M
    uiSum += abs( piOrg[8] - piCur[8] );
450
10.1M
    uiSum += abs( piOrg[9] - piCur[9] );
451
10.1M
    uiSum += abs( piOrg[10] - piCur[10] );
452
10.1M
    uiSum += abs( piOrg[11] - piCur[11] );
453
10.1M
    uiSum += abs( piOrg[12] - piCur[12] );
454
10.1M
    uiSum += abs( piOrg[13] - piCur[13] );
455
10.1M
    uiSum += abs( piOrg[14] - piCur[14] );
456
10.1M
    uiSum += abs( piOrg[15] - piCur[15] );
457
458
10.1M
    piOrg += iStrideOrg;
459
10.1M
    piCur += iStrideCur;
460
10.1M
  }
461
462
477k
  uiSum <<= iSubShift;
463
477k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
464
477k
}
465
466
467
Distortion RdCost::xGetSAD128( const DistParam &rcDtParam )
468
0
{
469
0
  const Pel* piOrg  = rcDtParam.org.buf;
470
0
  const Pel* piCur  = rcDtParam.cur.buf;
471
0
  int  iRows        = rcDtParam.org.height;
472
0
  int  iCols        = rcDtParam.org.width;
473
0
  int  iSubShift    = rcDtParam.subShift;
474
0
  int  iSubStep     = ( 1 << iSubShift );
475
0
  int  iStrideCur   = rcDtParam.cur.stride * iSubStep;
476
0
  int  iStrideOrg   = rcDtParam.org.stride * iSubStep;
477
478
0
  Distortion uiSum = 0;
479
480
0
  for( ; iRows != 0; iRows-=iSubStep )
481
0
  {
482
0
    for (int n = 0; n < iCols; n+=16 )
483
0
    {
484
0
      uiSum += abs( piOrg[n+ 0] - piCur[n+ 0] );
485
0
      uiSum += abs( piOrg[n+ 1] - piCur[n+ 1] );
486
0
      uiSum += abs( piOrg[n+ 2] - piCur[n+ 2] );
487
0
      uiSum += abs( piOrg[n+ 3] - piCur[n+ 3] );
488
0
      uiSum += abs( piOrg[n+ 4] - piCur[n+ 4] );
489
0
      uiSum += abs( piOrg[n+ 5] - piCur[n+ 5] );
490
0
      uiSum += abs( piOrg[n+ 6] - piCur[n+ 6] );
491
0
      uiSum += abs( piOrg[n+ 7] - piCur[n+ 7] );
492
0
      uiSum += abs( piOrg[n+ 8] - piCur[n+ 8] );
493
0
      uiSum += abs( piOrg[n+ 9] - piCur[n+ 9] );
494
0
      uiSum += abs( piOrg[n+10] - piCur[n+10] );
495
0
      uiSum += abs( piOrg[n+11] - piCur[n+11] );
496
0
      uiSum += abs( piOrg[n+12] - piCur[n+12] );
497
0
      uiSum += abs( piOrg[n+13] - piCur[n+13] );
498
0
      uiSum += abs( piOrg[n+14] - piCur[n+14] );
499
0
      uiSum += abs( piOrg[n+15] - piCur[n+15] );
500
0
    }
501
0
    piOrg += iStrideOrg;
502
0
    piCur += iStrideCur;
503
0
  }
504
505
0
  uiSum <<= iSubShift;
506
0
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
507
0
}
508
509
Distortion RdCost::xGetSAD32( const DistParam &rcDtParam )
510
710k
{
511
710k
  if ( rcDtParam.applyWeight )
512
0
  {
513
0
    THROW(" no support");
514
0
  }
515
516
710k
  const Pel* piOrg      = rcDtParam.org.buf;
517
710k
  const Pel* piCur      = rcDtParam.cur.buf;
518
710k
  int  iRows            = rcDtParam.org.height;
519
710k
  int  iSubShift        = rcDtParam.subShift;
520
710k
  int  iSubStep         = ( 1 << iSubShift );
521
710k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
522
710k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
523
524
710k
  Distortion uiSum = 0;
525
526
14.0M
  for( ; iRows != 0; iRows-=iSubStep )
527
13.3M
  {
528
13.3M
    uiSum += abs( piOrg[0] - piCur[0] );
529
13.3M
    uiSum += abs( piOrg[1] - piCur[1] );
530
13.3M
    uiSum += abs( piOrg[2] - piCur[2] );
531
13.3M
    uiSum += abs( piOrg[3] - piCur[3] );
532
13.3M
    uiSum += abs( piOrg[4] - piCur[4] );
533
13.3M
    uiSum += abs( piOrg[5] - piCur[5] );
534
13.3M
    uiSum += abs( piOrg[6] - piCur[6] );
535
13.3M
    uiSum += abs( piOrg[7] - piCur[7] );
536
13.3M
    uiSum += abs( piOrg[8] - piCur[8] );
537
13.3M
    uiSum += abs( piOrg[9] - piCur[9] );
538
13.3M
    uiSum += abs( piOrg[10] - piCur[10] );
539
13.3M
    uiSum += abs( piOrg[11] - piCur[11] );
540
13.3M
    uiSum += abs( piOrg[12] - piCur[12] );
541
13.3M
    uiSum += abs( piOrg[13] - piCur[13] );
542
13.3M
    uiSum += abs( piOrg[14] - piCur[14] );
543
13.3M
    uiSum += abs( piOrg[15] - piCur[15] );
544
13.3M
    uiSum += abs( piOrg[16] - piCur[16] );
545
13.3M
    uiSum += abs( piOrg[17] - piCur[17] );
546
13.3M
    uiSum += abs( piOrg[18] - piCur[18] );
547
13.3M
    uiSum += abs( piOrg[19] - piCur[19] );
548
13.3M
    uiSum += abs( piOrg[20] - piCur[20] );
549
13.3M
    uiSum += abs( piOrg[21] - piCur[21] );
550
13.3M
    uiSum += abs( piOrg[22] - piCur[22] );
551
13.3M
    uiSum += abs( piOrg[23] - piCur[23] );
552
13.3M
    uiSum += abs( piOrg[24] - piCur[24] );
553
13.3M
    uiSum += abs( piOrg[25] - piCur[25] );
554
13.3M
    uiSum += abs( piOrg[26] - piCur[26] );
555
13.3M
    uiSum += abs( piOrg[27] - piCur[27] );
556
13.3M
    uiSum += abs( piOrg[28] - piCur[28] );
557
13.3M
    uiSum += abs( piOrg[29] - piCur[29] );
558
13.3M
    uiSum += abs( piOrg[30] - piCur[30] );
559
13.3M
    uiSum += abs( piOrg[31] - piCur[31] );
560
561
13.3M
    piOrg += iStrideOrg;
562
13.3M
    piCur += iStrideCur;
563
13.3M
  }
564
565
710k
  uiSum <<= iSubShift;
566
710k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
567
710k
}
568
569
570
Distortion RdCost::xGetSAD64( const DistParam &rcDtParam )
571
8.05k
{
572
8.05k
  if ( rcDtParam.applyWeight )
573
0
  {
574
0
    THROW(" no support");
575
0
  }
576
577
8.05k
  const Pel* piOrg      = rcDtParam.org.buf;
578
8.05k
  const Pel* piCur      = rcDtParam.cur.buf;
579
8.05k
  int  iRows            = rcDtParam.org.height;
580
8.05k
  int  iSubShift        = rcDtParam.subShift;
581
8.05k
  int  iSubStep         = ( 1 << iSubShift );
582
8.05k
  int  iStrideCur       = rcDtParam.cur.stride * iSubStep;
583
8.05k
  int  iStrideOrg       = rcDtParam.org.stride * iSubStep;
584
585
8.05k
  Distortion uiSum = 0;
586
587
523k
  for( ; iRows != 0; iRows-=iSubStep )
588
515k
  {
589
515k
    uiSum += abs( piOrg[0] - piCur[0] );
590
515k
    uiSum += abs( piOrg[1] - piCur[1] );
591
515k
    uiSum += abs( piOrg[2] - piCur[2] );
592
515k
    uiSum += abs( piOrg[3] - piCur[3] );
593
515k
    uiSum += abs( piOrg[4] - piCur[4] );
594
515k
    uiSum += abs( piOrg[5] - piCur[5] );
595
515k
    uiSum += abs( piOrg[6] - piCur[6] );
596
515k
    uiSum += abs( piOrg[7] - piCur[7] );
597
515k
    uiSum += abs( piOrg[8] - piCur[8] );
598
515k
    uiSum += abs( piOrg[9] - piCur[9] );
599
515k
    uiSum += abs( piOrg[10] - piCur[10] );
600
515k
    uiSum += abs( piOrg[11] - piCur[11] );
601
515k
    uiSum += abs( piOrg[12] - piCur[12] );
602
515k
    uiSum += abs( piOrg[13] - piCur[13] );
603
515k
    uiSum += abs( piOrg[14] - piCur[14] );
604
515k
    uiSum += abs( piOrg[15] - piCur[15] );
605
515k
    uiSum += abs( piOrg[16] - piCur[16] );
606
515k
    uiSum += abs( piOrg[17] - piCur[17] );
607
515k
    uiSum += abs( piOrg[18] - piCur[18] );
608
515k
    uiSum += abs( piOrg[19] - piCur[19] );
609
515k
    uiSum += abs( piOrg[20] - piCur[20] );
610
515k
    uiSum += abs( piOrg[21] - piCur[21] );
611
515k
    uiSum += abs( piOrg[22] - piCur[22] );
612
515k
    uiSum += abs( piOrg[23] - piCur[23] );
613
515k
    uiSum += abs( piOrg[24] - piCur[24] );
614
515k
    uiSum += abs( piOrg[25] - piCur[25] );
615
515k
    uiSum += abs( piOrg[26] - piCur[26] );
616
515k
    uiSum += abs( piOrg[27] - piCur[27] );
617
515k
    uiSum += abs( piOrg[28] - piCur[28] );
618
515k
    uiSum += abs( piOrg[29] - piCur[29] );
619
515k
    uiSum += abs( piOrg[30] - piCur[30] );
620
515k
    uiSum += abs( piOrg[31] - piCur[31] );
621
515k
    uiSum += abs( piOrg[32] - piCur[32] );
622
515k
    uiSum += abs( piOrg[33] - piCur[33] );
623
515k
    uiSum += abs( piOrg[34] - piCur[34] );
624
515k
    uiSum += abs( piOrg[35] - piCur[35] );
625
515k
    uiSum += abs( piOrg[36] - piCur[36] );
626
515k
    uiSum += abs( piOrg[37] - piCur[37] );
627
515k
    uiSum += abs( piOrg[38] - piCur[38] );
628
515k
    uiSum += abs( piOrg[39] - piCur[39] );
629
515k
    uiSum += abs( piOrg[40] - piCur[40] );
630
515k
    uiSum += abs( piOrg[41] - piCur[41] );
631
515k
    uiSum += abs( piOrg[42] - piCur[42] );
632
515k
    uiSum += abs( piOrg[43] - piCur[43] );
633
515k
    uiSum += abs( piOrg[44] - piCur[44] );
634
515k
    uiSum += abs( piOrg[45] - piCur[45] );
635
515k
    uiSum += abs( piOrg[46] - piCur[46] );
636
515k
    uiSum += abs( piOrg[47] - piCur[47] );
637
515k
    uiSum += abs( piOrg[48] - piCur[48] );
638
515k
    uiSum += abs( piOrg[49] - piCur[49] );
639
515k
    uiSum += abs( piOrg[50] - piCur[50] );
640
515k
    uiSum += abs( piOrg[51] - piCur[51] );
641
515k
    uiSum += abs( piOrg[52] - piCur[52] );
642
515k
    uiSum += abs( piOrg[53] - piCur[53] );
643
515k
    uiSum += abs( piOrg[54] - piCur[54] );
644
515k
    uiSum += abs( piOrg[55] - piCur[55] );
645
515k
    uiSum += abs( piOrg[56] - piCur[56] );
646
515k
    uiSum += abs( piOrg[57] - piCur[57] );
647
515k
    uiSum += abs( piOrg[58] - piCur[58] );
648
515k
    uiSum += abs( piOrg[59] - piCur[59] );
649
515k
    uiSum += abs( piOrg[60] - piCur[60] );
650
515k
    uiSum += abs( piOrg[61] - piCur[61] );
651
515k
    uiSum += abs( piOrg[62] - piCur[62] );
652
515k
    uiSum += abs( piOrg[63] - piCur[63] );
653
654
515k
    piOrg += iStrideOrg;
655
515k
    piCur += iStrideCur;
656
515k
  }
657
658
8.05k
  uiSum <<= iSubShift;
659
8.05k
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
660
8.05k
}
661
662
663
// --------------------------------------------------------------------------------------------------------------------
664
// SSE
665
// --------------------------------------------------------------------------------------------------------------------
666
667
Distortion RdCost::xGetSSE( const DistParam &rcDtParam )
668
0
{
669
0
  if ( rcDtParam.applyWeight )
670
0
  {
671
0
    THROW(" no support");
672
0
  }
673
674
0
  const Pel* piOrg      = rcDtParam.org.buf;
675
0
  const Pel* piCur      = rcDtParam.cur.buf;
676
0
  int  iRows            = rcDtParam.org.height;
677
0
  int  iCols            = rcDtParam.org.width;
678
0
  int  iStrideCur       = rcDtParam.cur.stride;
679
0
  int  iStrideOrg       = rcDtParam.org.stride;
680
681
0
  Distortion uiSum   = 0;
682
0
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
683
684
0
  Intermediate_Int iTemp;
685
686
0
  for( ; iRows != 0; iRows-- )
687
0
  {
688
0
    for (int n = 0; n < iCols; n++ )
689
0
    {
690
0
      iTemp = piOrg[n  ] - piCur[n  ];
691
0
      uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
692
0
    }
693
0
    piOrg += iStrideOrg;
694
0
    piCur += iStrideCur;
695
0
  }
696
697
0
  return ( uiSum );
698
0
}
699
700
Distortion RdCost::xGetSSE4( const DistParam &rcDtParam )
701
569k
{
702
569k
  if ( rcDtParam.applyWeight )
703
0
  {
704
0
    CHECK( rcDtParam.org.width != 4, "Invalid size" );
705
0
    THROW(" no support");
706
0
  }
707
708
569k
  const Pel* piOrg   = rcDtParam.org.buf;
709
569k
  const Pel* piCur   = rcDtParam.cur.buf;
710
569k
  int  iRows         = rcDtParam.org.height;
711
569k
  int  iStrideOrg    = rcDtParam.org.stride;
712
569k
  int  iStrideCur    = rcDtParam.cur.stride;
713
714
569k
  Distortion uiSum   = 0;
715
569k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
716
717
569k
  Intermediate_Int  iTemp;
718
719
8.00M
  for( ; iRows != 0; iRows-- )
720
7.43M
  {
721
722
7.43M
    iTemp = piOrg[0] - piCur[0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
723
7.43M
    iTemp = piOrg[1] - piCur[1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
724
7.43M
    iTemp = piOrg[2] - piCur[2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
725
7.43M
    iTemp = piOrg[3] - piCur[3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
726
727
7.43M
    piOrg += iStrideOrg;
728
7.43M
    piCur += iStrideCur;
729
7.43M
  }
730
731
569k
  return ( uiSum );
732
569k
}
733
734
Distortion RdCost::xGetSSE8( const DistParam &rcDtParam )
735
864k
{
736
864k
  if ( rcDtParam.applyWeight )
737
0
  {
738
0
    CHECK( rcDtParam.org.width != 8, "Invalid size" );
739
0
    THROW(" no support");
740
0
  }
741
742
864k
  const Pel* piOrg   = rcDtParam.org.buf;
743
864k
  const Pel* piCur   = rcDtParam.cur.buf;
744
864k
  int  iRows         = rcDtParam.org.height;
745
864k
  int  iStrideOrg    = rcDtParam.org.stride;
746
864k
  int  iStrideCur    = rcDtParam.cur.stride;
747
748
864k
  Distortion uiSum   = 0;
749
864k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
750
751
864k
  Intermediate_Int  iTemp;
752
753
13.3M
  for( ; iRows != 0; iRows-- )
754
12.5M
  {
755
12.5M
    iTemp = piOrg[0] - piCur[0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
756
12.5M
    iTemp = piOrg[1] - piCur[1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
757
12.5M
    iTemp = piOrg[2] - piCur[2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
758
12.5M
    iTemp = piOrg[3] - piCur[3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
759
12.5M
    iTemp = piOrg[4] - piCur[4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
760
12.5M
    iTemp = piOrg[5] - piCur[5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
761
12.5M
    iTemp = piOrg[6] - piCur[6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
762
12.5M
    iTemp = piOrg[7] - piCur[7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
763
764
12.5M
    piOrg += iStrideOrg;
765
12.5M
    piCur += iStrideCur;
766
12.5M
  }
767
768
864k
  return ( uiSum );
769
864k
}
770
771
Distortion RdCost::xGetSSE16( const DistParam &rcDtParam )
772
708k
{
773
708k
  if ( rcDtParam.applyWeight )
774
0
  {
775
0
    CHECK( rcDtParam.org.width != 16, "Invalid size" );
776
0
    THROW(" no support");
777
0
  }
778
779
708k
  const Pel* piOrg   = rcDtParam.org.buf;
780
708k
  const Pel* piCur   = rcDtParam.cur.buf;
781
708k
  int  iRows         = rcDtParam.org.height;
782
708k
  int  iStrideOrg    = rcDtParam.org.stride;
783
708k
  int  iStrideCur    = rcDtParam.cur.stride;
784
785
708k
  Distortion uiSum   = 0;
786
708k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
787
788
708k
  Intermediate_Int  iTemp;
789
790
10.9M
  for( ; iRows != 0; iRows-- )
791
10.2M
  {
792
793
10.2M
    iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
794
10.2M
    iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
795
10.2M
    iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
796
10.2M
    iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
797
10.2M
    iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
798
10.2M
    iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
799
10.2M
    iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
800
10.2M
    iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
801
10.2M
    iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
802
10.2M
    iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
803
10.2M
    iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
804
10.2M
    iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
805
10.2M
    iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
806
10.2M
    iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
807
10.2M
    iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
808
10.2M
    iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
809
810
10.2M
    piOrg += iStrideOrg;
811
10.2M
    piCur += iStrideCur;
812
10.2M
  }
813
814
708k
  return ( uiSum );
815
708k
}
816
817
Distortion RdCost::xGetSSE128( const DistParam &rcDtParam )
818
0
{
819
0
  if ( rcDtParam.applyWeight )
820
0
  {
821
0
    THROW(" no support");
822
0
  }
823
0
  const Pel* piOrg   = rcDtParam.org.buf;
824
0
  const Pel* piCur   = rcDtParam.cur.buf;
825
0
  int  iRows         = rcDtParam.org.height;
826
0
  int  iCols         = rcDtParam.org.width;
827
0
  int  iStrideOrg    = rcDtParam.org.stride;
828
0
  int  iStrideCur    = rcDtParam.cur.stride;
829
830
0
  Distortion uiSum   = 0;
831
0
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
832
833
0
  Intermediate_Int  iTemp;
834
835
0
  for( ; iRows != 0; iRows-- )
836
0
  {
837
0
    for (int n = 0; n < iCols; n+=16 )
838
0
    {
839
840
0
      iTemp = piOrg[n+ 0] - piCur[n+ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
841
0
      iTemp = piOrg[n+ 1] - piCur[n+ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
842
0
      iTemp = piOrg[n+ 2] - piCur[n+ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
843
0
      iTemp = piOrg[n+ 3] - piCur[n+ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
844
0
      iTemp = piOrg[n+ 4] - piCur[n+ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
845
0
      iTemp = piOrg[n+ 5] - piCur[n+ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
846
0
      iTemp = piOrg[n+ 6] - piCur[n+ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
847
0
      iTemp = piOrg[n+ 7] - piCur[n+ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
848
0
      iTemp = piOrg[n+ 8] - piCur[n+ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
849
0
      iTemp = piOrg[n+ 9] - piCur[n+ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
850
0
      iTemp = piOrg[n+10] - piCur[n+10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
851
0
      iTemp = piOrg[n+11] - piCur[n+11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
852
0
      iTemp = piOrg[n+12] - piCur[n+12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
853
0
      iTemp = piOrg[n+13] - piCur[n+13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
854
0
      iTemp = piOrg[n+14] - piCur[n+14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
855
0
      iTemp = piOrg[n+15] - piCur[n+15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
856
857
0
    }
858
0
    piOrg += iStrideOrg;
859
0
    piCur += iStrideCur;
860
0
  }
861
862
0
  return ( uiSum );
863
0
}
864
865
Distortion RdCost::xGetSSE32( const DistParam &rcDtParam )
866
560k
{
867
560k
  if ( rcDtParam.applyWeight )
868
0
  {
869
0
    THROW(" no support");
870
0
  }
871
872
560k
  const Pel* piOrg   = rcDtParam.org.buf;
873
560k
  const Pel* piCur   = rcDtParam.cur.buf;
874
560k
  int  iRows         = rcDtParam.org.height;
875
560k
  int  iStrideOrg    = rcDtParam.org.stride;
876
560k
  int  iStrideCur    = rcDtParam.cur.stride;
877
878
560k
  Distortion uiSum   = 0;
879
560k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
880
881
560k
  Intermediate_Int  iTemp;
882
883
9.83M
  for( ; iRows != 0; iRows-- )
884
9.27M
  {
885
886
9.27M
    iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
887
9.27M
    iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
888
9.27M
    iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
889
9.27M
    iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
890
9.27M
    iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
891
9.27M
    iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
892
9.27M
    iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
893
9.27M
    iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
894
9.27M
    iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
895
9.27M
    iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
896
9.27M
    iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
897
9.27M
    iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
898
9.27M
    iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
899
9.27M
    iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
900
9.27M
    iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
901
9.27M
    iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
902
9.27M
    iTemp = piOrg[16] - piCur[16]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
903
9.27M
    iTemp = piOrg[17] - piCur[17]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
904
9.27M
    iTemp = piOrg[18] - piCur[18]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
905
9.27M
    iTemp = piOrg[19] - piCur[19]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
906
9.27M
    iTemp = piOrg[20] - piCur[20]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
907
9.27M
    iTemp = piOrg[21] - piCur[21]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
908
9.27M
    iTemp = piOrg[22] - piCur[22]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
909
9.27M
    iTemp = piOrg[23] - piCur[23]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
910
9.27M
    iTemp = piOrg[24] - piCur[24]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
911
9.27M
    iTemp = piOrg[25] - piCur[25]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
912
9.27M
    iTemp = piOrg[26] - piCur[26]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
913
9.27M
    iTemp = piOrg[27] - piCur[27]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
914
9.27M
    iTemp = piOrg[28] - piCur[28]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
915
9.27M
    iTemp = piOrg[29] - piCur[29]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
916
9.27M
    iTemp = piOrg[30] - piCur[30]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
917
9.27M
    iTemp = piOrg[31] - piCur[31]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
918
919
9.27M
    piOrg += iStrideOrg;
920
9.27M
    piCur += iStrideCur;
921
9.27M
  }
922
923
560k
  return ( uiSum );
924
560k
}
925
926
Distortion RdCost::xGetSSE64( const DistParam &rcDtParam )
927
64.1k
{
928
64.1k
  if ( rcDtParam.applyWeight )
929
0
  {
930
0
    THROW(" no support");
931
0
  }
932
933
64.1k
  const Pel* piOrg   = rcDtParam.org.buf;
934
64.1k
  const Pel* piCur   = rcDtParam.cur.buf;
935
64.1k
  int  iRows         = rcDtParam.org.height;
936
64.1k
  int  iStrideOrg    = rcDtParam.org.stride;
937
64.1k
  int  iStrideCur    = rcDtParam.cur.stride;
938
939
64.1k
  Distortion uiSum   = 0;
940
64.1k
  uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;
941
942
64.1k
  Intermediate_Int  iTemp;
943
944
3.41M
  for( ; iRows != 0; iRows-- )
945
3.34M
  {
946
3.34M
    iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
947
3.34M
    iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
948
3.34M
    iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
949
3.34M
    iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
950
3.34M
    iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
951
3.34M
    iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
952
3.34M
    iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
953
3.34M
    iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
954
3.34M
    iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
955
3.34M
    iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
956
3.34M
    iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
957
3.34M
    iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
958
3.34M
    iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
959
3.34M
    iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
960
3.34M
    iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
961
3.34M
    iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
962
3.34M
    iTemp = piOrg[16] - piCur[16]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
963
3.34M
    iTemp = piOrg[17] - piCur[17]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
964
3.34M
    iTemp = piOrg[18] - piCur[18]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
965
3.34M
    iTemp = piOrg[19] - piCur[19]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
966
3.34M
    iTemp = piOrg[20] - piCur[20]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
967
3.34M
    iTemp = piOrg[21] - piCur[21]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
968
3.34M
    iTemp = piOrg[22] - piCur[22]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
969
3.34M
    iTemp = piOrg[23] - piCur[23]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
970
3.34M
    iTemp = piOrg[24] - piCur[24]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
971
3.34M
    iTemp = piOrg[25] - piCur[25]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
972
3.34M
    iTemp = piOrg[26] - piCur[26]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
973
3.34M
    iTemp = piOrg[27] - piCur[27]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
974
3.34M
    iTemp = piOrg[28] - piCur[28]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
975
3.34M
    iTemp = piOrg[29] - piCur[29]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
976
3.34M
    iTemp = piOrg[30] - piCur[30]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
977
3.34M
    iTemp = piOrg[31] - piCur[31]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
978
3.34M
    iTemp = piOrg[32] - piCur[32]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
979
3.34M
    iTemp = piOrg[33] - piCur[33]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
980
3.34M
    iTemp = piOrg[34] - piCur[34]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
981
3.34M
    iTemp = piOrg[35] - piCur[35]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
982
3.34M
    iTemp = piOrg[36] - piCur[36]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
983
3.34M
    iTemp = piOrg[37] - piCur[37]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
984
3.34M
    iTemp = piOrg[38] - piCur[38]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
985
3.34M
    iTemp = piOrg[39] - piCur[39]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
986
3.34M
    iTemp = piOrg[40] - piCur[40]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
987
3.34M
    iTemp = piOrg[41] - piCur[41]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
988
3.34M
    iTemp = piOrg[42] - piCur[42]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
989
3.34M
    iTemp = piOrg[43] - piCur[43]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
990
3.34M
    iTemp = piOrg[44] - piCur[44]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
991
3.34M
    iTemp = piOrg[45] - piCur[45]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
992
3.34M
    iTemp = piOrg[46] - piCur[46]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
993
3.34M
    iTemp = piOrg[47] - piCur[47]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
994
3.34M
    iTemp = piOrg[48] - piCur[48]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
995
3.34M
    iTemp = piOrg[49] - piCur[49]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
996
3.34M
    iTemp = piOrg[50] - piCur[50]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
997
3.34M
    iTemp = piOrg[51] - piCur[51]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
998
3.34M
    iTemp = piOrg[52] - piCur[52]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
999
3.34M
    iTemp = piOrg[53] - piCur[53]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1000
3.34M
    iTemp = piOrg[54] - piCur[54]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1001
3.34M
    iTemp = piOrg[55] - piCur[55]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1002
3.34M
    iTemp = piOrg[56] - piCur[56]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1003
3.34M
    iTemp = piOrg[57] - piCur[57]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1004
3.34M
    iTemp = piOrg[58] - piCur[58]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1005
3.34M
    iTemp = piOrg[59] - piCur[59]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1006
3.34M
    iTemp = piOrg[60] - piCur[60]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1007
3.34M
    iTemp = piOrg[61] - piCur[61]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1008
3.34M
    iTemp = piOrg[62] - piCur[62]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1009
3.34M
    iTemp = piOrg[63] - piCur[63]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
1010
1011
3.34M
    piOrg += iStrideOrg;
1012
3.34M
    piCur += iStrideCur;
1013
3.34M
  }
1014
1015
64.1k
  return ( uiSum );
1016
64.1k
}
1017
1018
// --------------------------------------------------------------------------------------------------------------------
1019
// HADAMARD with step (used in fractional search)
1020
// --------------------------------------------------------------------------------------------------------------------
1021
1022
Distortion RdCost::xCalcHADs2x2( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1023
165k
{
1024
165k
  Distortion satd = 0;
1025
165k
  TCoeff diff[4], m[4];
1026
1027
165k
  diff[0] = piOrg[0             ] - piCur[0];
1028
165k
  diff[1] = piOrg[1             ] - piCur[1];
1029
165k
  diff[2] = piOrg[iStrideOrg    ] - piCur[0 + iStrideCur];
1030
165k
  diff[3] = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur];
1031
165k
  m[0] = diff[0] + diff[2];
1032
165k
  m[1] = diff[1] + diff[3];
1033
165k
  m[2] = diff[0] - diff[2];
1034
165k
  m[3] = diff[1] - diff[3];
1035
  
1036
165k
  satd += abs(m[0] + m[1]) >> 2;
1037
165k
  satd += abs(m[0] - m[1]);
1038
165k
  satd += abs(m[2] + m[3]);
1039
165k
  satd += abs(m[2] - m[3]);
1040
1041
165k
  return satd;
1042
165k
}
1043
1044
static Distortion xCalcHADs4x4( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1045
45.0k
{
1046
45.0k
  int k;
1047
45.0k
  Distortion satd = 0;
1048
45.0k
  TCoeff diff[16], m[16], d[16];
1049
1050
225k
  for( k = 0; k < 16; k+=4 )
1051
180k
  {
1052
180k
    diff[k+0] = piOrg[0] - piCur[0];
1053
180k
    diff[k+1] = piOrg[1] - piCur[1];
1054
180k
    diff[k+2] = piOrg[2] - piCur[2];
1055
180k
    diff[k+3] = piOrg[3] - piCur[3];
1056
1057
180k
    piCur += iStrideCur;
1058
180k
    piOrg += iStrideOrg;
1059
180k
  }
1060
1061
  /*===== hadamard transform =====*/
1062
45.0k
  m[ 0] = diff[ 0] + diff[12];
1063
45.0k
  m[ 1] = diff[ 1] + diff[13];
1064
45.0k
  m[ 2] = diff[ 2] + diff[14];
1065
45.0k
  m[ 3] = diff[ 3] + diff[15];
1066
45.0k
  m[ 4] = diff[ 4] + diff[ 8];
1067
45.0k
  m[ 5] = diff[ 5] + diff[ 9];
1068
45.0k
  m[ 6] = diff[ 6] + diff[10];
1069
45.0k
  m[ 7] = diff[ 7] + diff[11];
1070
45.0k
  m[ 8] = diff[ 4] - diff[ 8];
1071
45.0k
  m[ 9] = diff[ 5] - diff[ 9];
1072
45.0k
  m[10] = diff[ 6] - diff[10];
1073
45.0k
  m[11] = diff[ 7] - diff[11];
1074
45.0k
  m[12] = diff[ 0] - diff[12];
1075
45.0k
  m[13] = diff[ 1] - diff[13];
1076
45.0k
  m[14] = diff[ 2] - diff[14];
1077
45.0k
  m[15] = diff[ 3] - diff[15];
1078
1079
45.0k
  d[ 0] = m[ 0] + m[ 4];
1080
45.0k
  d[ 1] = m[ 1] + m[ 5];
1081
45.0k
  d[ 2] = m[ 2] + m[ 6];
1082
45.0k
  d[ 3] = m[ 3] + m[ 7];
1083
45.0k
  d[ 4] = m[ 8] + m[12];
1084
45.0k
  d[ 5] = m[ 9] + m[13];
1085
45.0k
  d[ 6] = m[10] + m[14];
1086
45.0k
  d[ 7] = m[11] + m[15];
1087
45.0k
  d[ 8] = m[ 0] - m[ 4];
1088
45.0k
  d[ 9] = m[ 1] - m[ 5];
1089
45.0k
  d[10] = m[ 2] - m[ 6];
1090
45.0k
  d[11] = m[ 3] - m[ 7];
1091
45.0k
  d[12] = m[12] - m[ 8];
1092
45.0k
  d[13] = m[13] - m[ 9];
1093
45.0k
  d[14] = m[14] - m[10];
1094
45.0k
  d[15] = m[15] - m[11];
1095
1096
45.0k
  m[ 0] = d[ 0] + d[ 3];
1097
45.0k
  m[ 1] = d[ 1] + d[ 2];
1098
45.0k
  m[ 2] = d[ 1] - d[ 2];
1099
45.0k
  m[ 3] = d[ 0] - d[ 3];
1100
45.0k
  m[ 4] = d[ 4] + d[ 7];
1101
45.0k
  m[ 5] = d[ 5] + d[ 6];
1102
45.0k
  m[ 6] = d[ 5] - d[ 6];
1103
45.0k
  m[ 7] = d[ 4] - d[ 7];
1104
45.0k
  m[ 8] = d[ 8] + d[11];
1105
45.0k
  m[ 9] = d[ 9] + d[10];
1106
45.0k
  m[10] = d[ 9] - d[10];
1107
45.0k
  m[11] = d[ 8] - d[11];
1108
45.0k
  m[12] = d[12] + d[15];
1109
45.0k
  m[13] = d[13] + d[14];
1110
45.0k
  m[14] = d[13] - d[14];
1111
45.0k
  m[15] = d[12] - d[15];
1112
1113
45.0k
  d[ 0] = m[ 0] + m[ 1];
1114
45.0k
  d[ 1] = m[ 0] - m[ 1];
1115
45.0k
  d[ 2] = m[ 2] + m[ 3];
1116
45.0k
  d[ 3] = m[ 3] - m[ 2];
1117
45.0k
  d[ 4] = m[ 4] + m[ 5];
1118
45.0k
  d[ 5] = m[ 4] - m[ 5];
1119
45.0k
  d[ 6] = m[ 6] + m[ 7];
1120
45.0k
  d[ 7] = m[ 7] - m[ 6];
1121
45.0k
  d[ 8] = m[ 8] + m[ 9];
1122
45.0k
  d[ 9] = m[ 8] - m[ 9];
1123
45.0k
  d[10] = m[10] + m[11];
1124
45.0k
  d[11] = m[11] - m[10];
1125
45.0k
  d[12] = m[12] + m[13];
1126
45.0k
  d[13] = m[12] - m[13];
1127
45.0k
  d[14] = m[14] + m[15];
1128
45.0k
  d[15] = m[15] - m[14];
1129
1130
765k
  for (k=0; k<16; ++k)
1131
720k
  {
1132
720k
    satd += abs(d[k]);
1133
720k
  }
1134
1135
45.0k
  satd -= abs( d[0] );
1136
45.0k
  satd += abs( d[0] ) >> 2;
1137
45.0k
  satd = ((satd+1)>>1);
1138
1139
45.0k
  return satd;
1140
45.0k
}
1141
1142
static Distortion xCalcHADs16x16_fast( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1143
0
{
1144
0
  int k, i, j, jj;
1145
0
  Distortion sad = 0;
1146
0
  TCoeff diff[64], m1[8][8], m2[8][8], m3[8][8];
1147
1148
0
  for( k = 0; k < 64; k += 8 )
1149
0
  {
1150
0
    diff[k+0] = ( ( piOrg[ 0] + piOrg[ 0+1] + piOrg[ 0+iStrideOrg] + piOrg[ 0+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 0] + piCur[ 0+1] + piCur[ 0+iStrideCur] + piCur[ 0+1+iStrideCur] + 2 ) >> 2 );
1151
0
    diff[k+1] = ( ( piOrg[ 2] + piOrg[ 2+1] + piOrg[ 2+iStrideOrg] + piOrg[ 2+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 2] + piCur[ 2+1] + piCur[ 2+iStrideCur] + piCur[ 2+1+iStrideCur] + 2 ) >> 2 );
1152
0
    diff[k+2] = ( ( piOrg[ 4] + piOrg[ 4+1] + piOrg[ 4+iStrideOrg] + piOrg[ 4+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 4] + piCur[ 4+1] + piCur[ 4+iStrideCur] + piCur[ 4+1+iStrideCur] + 2 ) >> 2 );
1153
0
    diff[k+3] = ( ( piOrg[ 6] + piOrg[ 6+1] + piOrg[ 6+iStrideOrg] + piOrg[ 6+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 6] + piCur[ 6+1] + piCur[ 6+iStrideCur] + piCur[ 6+1+iStrideCur] + 2 ) >> 2 );
1154
0
    diff[k+4] = ( ( piOrg[ 8] + piOrg[ 8+1] + piOrg[ 8+iStrideOrg] + piOrg[ 8+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[ 8] + piCur[ 8+1] + piCur[ 8+iStrideCur] + piCur[ 8+1+iStrideCur] + 2 ) >> 2 );
1155
0
    diff[k+5] = ( ( piOrg[10] + piOrg[10+1] + piOrg[10+iStrideOrg] + piOrg[10+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[10] + piCur[10+1] + piCur[10+iStrideCur] + piCur[10+1+iStrideCur] + 2 ) >> 2 );
1156
0
    diff[k+6] = ( ( piOrg[12] + piOrg[12+1] + piOrg[12+iStrideOrg] + piOrg[12+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[12] + piCur[12+1] + piCur[12+iStrideCur] + piCur[12+1+iStrideCur] + 2 ) >> 2 );
1157
0
    diff[k+7] = ( ( piOrg[14] + piOrg[14+1] + piOrg[14+iStrideOrg] + piOrg[14+1+iStrideOrg] + 2 ) >> 2 ) - ( ( piCur[14] + piCur[14+1] + piCur[14+iStrideCur] + piCur[14+1+iStrideCur] + 2 ) >> 2 );
1158
1159
0
    piCur += 2 * iStrideCur;
1160
0
    piOrg += 2 * iStrideOrg;
1161
0
  }
1162
1163
  //horizontal
1164
0
  for (j=0; j < 8; j++)
1165
0
  {
1166
0
    jj = j << 3;
1167
0
    m2[j][0] = diff[jj  ] + diff[jj+4];
1168
0
    m2[j][1] = diff[jj+1] + diff[jj+5];
1169
0
    m2[j][2] = diff[jj+2] + diff[jj+6];
1170
0
    m2[j][3] = diff[jj+3] + diff[jj+7];
1171
0
    m2[j][4] = diff[jj  ] - diff[jj+4];
1172
0
    m2[j][5] = diff[jj+1] - diff[jj+5];
1173
0
    m2[j][6] = diff[jj+2] - diff[jj+6];
1174
0
    m2[j][7] = diff[jj+3] - diff[jj+7];
1175
1176
0
    m1[j][0] = m2[j][0] + m2[j][2];
1177
0
    m1[j][1] = m2[j][1] + m2[j][3];
1178
0
    m1[j][2] = m2[j][0] - m2[j][2];
1179
0
    m1[j][3] = m2[j][1] - m2[j][3];
1180
0
    m1[j][4] = m2[j][4] + m2[j][6];
1181
0
    m1[j][5] = m2[j][5] + m2[j][7];
1182
0
    m1[j][6] = m2[j][4] - m2[j][6];
1183
0
    m1[j][7] = m2[j][5] - m2[j][7];
1184
1185
0
    m2[j][0] = m1[j][0] + m1[j][1];
1186
0
    m2[j][1] = m1[j][0] - m1[j][1];
1187
0
    m2[j][2] = m1[j][2] + m1[j][3];
1188
0
    m2[j][3] = m1[j][2] - m1[j][3];
1189
0
    m2[j][4] = m1[j][4] + m1[j][5];
1190
0
    m2[j][5] = m1[j][4] - m1[j][5];
1191
0
    m2[j][6] = m1[j][6] + m1[j][7];
1192
0
    m2[j][7] = m1[j][6] - m1[j][7];
1193
0
  }
1194
1195
  //vertical
1196
0
  for (i=0; i < 8; i++)
1197
0
  {
1198
0
    m3[0][i] = m2[0][i] + m2[4][i];
1199
0
    m3[1][i] = m2[1][i] + m2[5][i];
1200
0
    m3[2][i] = m2[2][i] + m2[6][i];
1201
0
    m3[3][i] = m2[3][i] + m2[7][i];
1202
0
    m3[4][i] = m2[0][i] - m2[4][i];
1203
0
    m3[5][i] = m2[1][i] - m2[5][i];
1204
0
    m3[6][i] = m2[2][i] - m2[6][i];
1205
0
    m3[7][i] = m2[3][i] - m2[7][i];
1206
1207
0
    m1[0][i] = m3[0][i] + m3[2][i];
1208
0
    m1[1][i] = m3[1][i] + m3[3][i];
1209
0
    m1[2][i] = m3[0][i] - m3[2][i];
1210
0
    m1[3][i] = m3[1][i] - m3[3][i];
1211
0
    m1[4][i] = m3[4][i] + m3[6][i];
1212
0
    m1[5][i] = m3[5][i] + m3[7][i];
1213
0
    m1[6][i] = m3[4][i] - m3[6][i];
1214
0
    m1[7][i] = m3[5][i] - m3[7][i];
1215
1216
0
    m2[0][i] = m1[0][i] + m1[1][i];
1217
0
    m2[1][i] = m1[0][i] - m1[1][i];
1218
0
    m2[2][i] = m1[2][i] + m1[3][i];
1219
0
    m2[3][i] = m1[2][i] - m1[3][i];
1220
0
    m2[4][i] = m1[4][i] + m1[5][i];
1221
0
    m2[5][i] = m1[4][i] - m1[5][i];
1222
0
    m2[6][i] = m1[6][i] + m1[7][i];
1223
0
    m2[7][i] = m1[6][i] - m1[7][i];
1224
0
  }
1225
1226
0
  for (i = 0; i < 8; i++)
1227
0
  {
1228
0
    for (j = 0; j < 8; j++)
1229
0
    {
1230
0
      sad += abs(m2[i][j]);
1231
0
    }
1232
0
  }
1233
  
1234
0
  sad -= abs( m2[0][0] );
1235
0
  sad += abs( m2[0][0] ) >> 2;
1236
0
  sad=((sad+2)>>2);
1237
1238
0
  return (sad << 2);
1239
0
}
1240
1241
static Distortion xCalcHADs8x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1242
15.4M
{
1243
15.4M
  int k, i, j, jj;
1244
15.4M
  Distortion sad = 0;
1245
15.4M
  TCoeff diff[64], m1[8][8], m2[8][8], m3[8][8];
1246
1247
139M
  for( k = 0; k < 64; k += 8 )
1248
123M
  {
1249
123M
    diff[k+0] = piOrg[0] - piCur[0];
1250
123M
    diff[k+1] = piOrg[1] - piCur[1];
1251
123M
    diff[k+2] = piOrg[2] - piCur[2];
1252
123M
    diff[k+3] = piOrg[3] - piCur[3];
1253
123M
    diff[k+4] = piOrg[4] - piCur[4];
1254
123M
    diff[k+5] = piOrg[5] - piCur[5];
1255
123M
    diff[k+6] = piOrg[6] - piCur[6];
1256
123M
    diff[k+7] = piOrg[7] - piCur[7];
1257
1258
123M
    piCur += iStrideCur;
1259
123M
    piOrg += iStrideOrg;
1260
123M
  }
1261
1262
  //horizontal
1263
139M
  for (j=0; j < 8; j++)
1264
123M
  {
1265
123M
    jj = j << 3;
1266
123M
    m2[j][0] = diff[jj  ] + diff[jj+4];
1267
123M
    m2[j][1] = diff[jj+1] + diff[jj+5];
1268
123M
    m2[j][2] = diff[jj+2] + diff[jj+6];
1269
123M
    m2[j][3] = diff[jj+3] + diff[jj+7];
1270
123M
    m2[j][4] = diff[jj  ] - diff[jj+4];
1271
123M
    m2[j][5] = diff[jj+1] - diff[jj+5];
1272
123M
    m2[j][6] = diff[jj+2] - diff[jj+6];
1273
123M
    m2[j][7] = diff[jj+3] - diff[jj+7];
1274
1275
123M
    m1[j][0] = m2[j][0] + m2[j][2];
1276
123M
    m1[j][1] = m2[j][1] + m2[j][3];
1277
123M
    m1[j][2] = m2[j][0] - m2[j][2];
1278
123M
    m1[j][3] = m2[j][1] - m2[j][3];
1279
123M
    m1[j][4] = m2[j][4] + m2[j][6];
1280
123M
    m1[j][5] = m2[j][5] + m2[j][7];
1281
123M
    m1[j][6] = m2[j][4] - m2[j][6];
1282
123M
    m1[j][7] = m2[j][5] - m2[j][7];
1283
1284
123M
    m2[j][0] = m1[j][0] + m1[j][1];
1285
123M
    m2[j][1] = m1[j][0] - m1[j][1];
1286
123M
    m2[j][2] = m1[j][2] + m1[j][3];
1287
123M
    m2[j][3] = m1[j][2] - m1[j][3];
1288
123M
    m2[j][4] = m1[j][4] + m1[j][5];
1289
123M
    m2[j][5] = m1[j][4] - m1[j][5];
1290
123M
    m2[j][6] = m1[j][6] + m1[j][7];
1291
123M
    m2[j][7] = m1[j][6] - m1[j][7];
1292
123M
  }
1293
1294
  //vertical
1295
139M
  for (i=0; i < 8; i++)
1296
123M
  {
1297
123M
    m3[0][i] = m2[0][i] + m2[4][i];
1298
123M
    m3[1][i] = m2[1][i] + m2[5][i];
1299
123M
    m3[2][i] = m2[2][i] + m2[6][i];
1300
123M
    m3[3][i] = m2[3][i] + m2[7][i];
1301
123M
    m3[4][i] = m2[0][i] - m2[4][i];
1302
123M
    m3[5][i] = m2[1][i] - m2[5][i];
1303
123M
    m3[6][i] = m2[2][i] - m2[6][i];
1304
123M
    m3[7][i] = m2[3][i] - m2[7][i];
1305
1306
123M
    m1[0][i] = m3[0][i] + m3[2][i];
1307
123M
    m1[1][i] = m3[1][i] + m3[3][i];
1308
123M
    m1[2][i] = m3[0][i] - m3[2][i];
1309
123M
    m1[3][i] = m3[1][i] - m3[3][i];
1310
123M
    m1[4][i] = m3[4][i] + m3[6][i];
1311
123M
    m1[5][i] = m3[5][i] + m3[7][i];
1312
123M
    m1[6][i] = m3[4][i] - m3[6][i];
1313
123M
    m1[7][i] = m3[5][i] - m3[7][i];
1314
1315
123M
    m2[0][i] = m1[0][i] + m1[1][i];
1316
123M
    m2[1][i] = m1[0][i] - m1[1][i];
1317
123M
    m2[2][i] = m1[2][i] + m1[3][i];
1318
123M
    m2[3][i] = m1[2][i] - m1[3][i];
1319
123M
    m2[4][i] = m1[4][i] + m1[5][i];
1320
123M
    m2[5][i] = m1[4][i] - m1[5][i];
1321
123M
    m2[6][i] = m1[6][i] + m1[7][i];
1322
123M
    m2[7][i] = m1[6][i] - m1[7][i];
1323
123M
  }
1324
1325
139M
  for (i = 0; i < 8; i++)
1326
123M
  {
1327
1.11G
    for (j = 0; j < 8; j++)
1328
991M
    {
1329
991M
      sad += abs(m2[i][j]);
1330
991M
    }
1331
123M
  }
1332
  
1333
15.4M
  sad -= abs( m2[0][0] );
1334
15.4M
  sad += abs( m2[0][0] ) >> 2;
1335
15.4M
  sad=((sad+2)>>2);
1336
1337
15.4M
  return sad;
1338
15.4M
}
1339
1340
static Distortion xCalcHADs16x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1341
928k
{   //need to add SIMD implementation ,JCA
1342
928k
  int k, i, j, jj, sad = 0;
1343
928k
  int diff[128], m1[8][16], m2[8][16];
1344
8.35M
  for( k = 0; k < 128; k += 16 )
1345
7.42M
  {
1346
7.42M
    diff[k + 0] = piOrg[0] - piCur[0];
1347
7.42M
    diff[k + 1] = piOrg[1] - piCur[1];
1348
7.42M
    diff[k + 2] = piOrg[2] - piCur[2];
1349
7.42M
    diff[k + 3] = piOrg[3] - piCur[3];
1350
7.42M
    diff[k + 4] = piOrg[4] - piCur[4];
1351
7.42M
    diff[k + 5] = piOrg[5] - piCur[5];
1352
7.42M
    diff[k + 6] = piOrg[6] - piCur[6];
1353
7.42M
    diff[k + 7] = piOrg[7] - piCur[7];
1354
1355
7.42M
    diff[k + 8] = piOrg[8] - piCur[8];
1356
7.42M
    diff[k + 9] = piOrg[9] - piCur[9];
1357
7.42M
    diff[k + 10] = piOrg[10] - piCur[10];
1358
7.42M
    diff[k + 11] = piOrg[11] - piCur[11];
1359
7.42M
    diff[k + 12] = piOrg[12] - piCur[12];
1360
7.42M
    diff[k + 13] = piOrg[13] - piCur[13];
1361
7.42M
    diff[k + 14] = piOrg[14] - piCur[14];
1362
7.42M
    diff[k + 15] = piOrg[15] - piCur[15];
1363
1364
7.42M
    piCur += iStrideCur;
1365
7.42M
    piOrg += iStrideOrg;
1366
7.42M
  }
1367
1368
  //horizontal
1369
8.35M
  for( j = 0; j < 8; j++ )
1370
7.42M
  {
1371
7.42M
    jj = j << 4;
1372
1373
7.42M
    m2[j][0] = diff[jj    ] + diff[jj + 8];
1374
7.42M
    m2[j][1] = diff[jj + 1] + diff[jj + 9];
1375
7.42M
    m2[j][2] = diff[jj + 2] + diff[jj + 10];
1376
7.42M
    m2[j][3] = diff[jj + 3] + diff[jj + 11];
1377
7.42M
    m2[j][4] = diff[jj + 4] + diff[jj + 12];
1378
7.42M
    m2[j][5] = diff[jj + 5] + diff[jj + 13];
1379
7.42M
    m2[j][6] = diff[jj + 6] + diff[jj + 14];
1380
7.42M
    m2[j][7] = diff[jj + 7] + diff[jj + 15];
1381
7.42M
    m2[j][8] = diff[jj    ] - diff[jj + 8];
1382
7.42M
    m2[j][9] = diff[jj + 1] - diff[jj + 9];
1383
7.42M
    m2[j][10] = diff[jj + 2] - diff[jj + 10];
1384
7.42M
    m2[j][11] = diff[jj + 3] - diff[jj + 11];
1385
7.42M
    m2[j][12] = diff[jj + 4] - diff[jj + 12];
1386
7.42M
    m2[j][13] = diff[jj + 5] - diff[jj + 13];
1387
7.42M
    m2[j][14] = diff[jj + 6] - diff[jj + 14];
1388
7.42M
    m2[j][15] = diff[jj + 7] - diff[jj + 15];
1389
1390
7.42M
    m1[j][0] = m2[j][0] + m2[j][4];
1391
7.42M
    m1[j][1] = m2[j][1] + m2[j][5];
1392
7.42M
    m1[j][2] = m2[j][2] + m2[j][6];
1393
7.42M
    m1[j][3] = m2[j][3] + m2[j][7];
1394
7.42M
    m1[j][4] = m2[j][0] - m2[j][4];
1395
7.42M
    m1[j][5] = m2[j][1] - m2[j][5];
1396
7.42M
    m1[j][6] = m2[j][2] - m2[j][6];
1397
7.42M
    m1[j][7] = m2[j][3] - m2[j][7];
1398
7.42M
    m1[j][8] = m2[j][8] + m2[j][12];
1399
7.42M
    m1[j][9] = m2[j][9] + m2[j][13];
1400
7.42M
    m1[j][10] = m2[j][10] + m2[j][14];
1401
7.42M
    m1[j][11] = m2[j][11] + m2[j][15];
1402
7.42M
    m1[j][12] = m2[j][8] - m2[j][12];
1403
7.42M
    m1[j][13] = m2[j][9] - m2[j][13];
1404
7.42M
    m1[j][14] = m2[j][10] - m2[j][14];
1405
7.42M
    m1[j][15] = m2[j][11] - m2[j][15];
1406
1407
7.42M
    m2[j][0] = m1[j][0] + m1[j][2];
1408
7.42M
    m2[j][1] = m1[j][1] + m1[j][3];
1409
7.42M
    m2[j][2] = m1[j][0] - m1[j][2];
1410
7.42M
    m2[j][3] = m1[j][1] - m1[j][3];
1411
7.42M
    m2[j][4] = m1[j][4] + m1[j][6];
1412
7.42M
    m2[j][5] = m1[j][5] + m1[j][7];
1413
7.42M
    m2[j][6] = m1[j][4] - m1[j][6];
1414
7.42M
    m2[j][7] = m1[j][5] - m1[j][7];
1415
7.42M
    m2[j][8] = m1[j][8] + m1[j][10];
1416
7.42M
    m2[j][9] = m1[j][9] + m1[j][11];
1417
7.42M
    m2[j][10] = m1[j][8] - m1[j][10];
1418
7.42M
    m2[j][11] = m1[j][9] - m1[j][11];
1419
7.42M
    m2[j][12] = m1[j][12] + m1[j][14];
1420
7.42M
    m2[j][13] = m1[j][13] + m1[j][15];
1421
7.42M
    m2[j][14] = m1[j][12] - m1[j][14];
1422
7.42M
    m2[j][15] = m1[j][13] - m1[j][15];
1423
1424
7.42M
    m1[j][0] = m2[j][0] + m2[j][1];
1425
7.42M
    m1[j][1] = m2[j][0] - m2[j][1];
1426
7.42M
    m1[j][2] = m2[j][2] + m2[j][3];
1427
7.42M
    m1[j][3] = m2[j][2] - m2[j][3];
1428
7.42M
    m1[j][4] = m2[j][4] + m2[j][5];
1429
7.42M
    m1[j][5] = m2[j][4] - m2[j][5];
1430
7.42M
    m1[j][6] = m2[j][6] + m2[j][7];
1431
7.42M
    m1[j][7] = m2[j][6] - m2[j][7];
1432
7.42M
    m1[j][8] = m2[j][8] + m2[j][9];
1433
7.42M
    m1[j][9] = m2[j][8] - m2[j][9];
1434
7.42M
    m1[j][10] = m2[j][10] + m2[j][11];
1435
7.42M
    m1[j][11] = m2[j][10] - m2[j][11];
1436
7.42M
    m1[j][12] = m2[j][12] + m2[j][13];
1437
7.42M
    m1[j][13] = m2[j][12] - m2[j][13];
1438
7.42M
    m1[j][14] = m2[j][14] + m2[j][15];
1439
7.42M
    m1[j][15] = m2[j][14] - m2[j][15];
1440
7.42M
  }
1441
1442
  //vertical
1443
15.7M
  for( i = 0; i < 16; i++ )
1444
14.8M
  {
1445
14.8M
    m2[0][i] = m1[0][i] + m1[4][i];
1446
14.8M
    m2[1][i] = m1[1][i] + m1[5][i];
1447
14.8M
    m2[2][i] = m1[2][i] + m1[6][i];
1448
14.8M
    m2[3][i] = m1[3][i] + m1[7][i];
1449
14.8M
    m2[4][i] = m1[0][i] - m1[4][i];
1450
14.8M
    m2[5][i] = m1[1][i] - m1[5][i];
1451
14.8M
    m2[6][i] = m1[2][i] - m1[6][i];
1452
14.8M
    m2[7][i] = m1[3][i] - m1[7][i];
1453
1454
14.8M
    m1[0][i] = m2[0][i] + m2[2][i];
1455
14.8M
    m1[1][i] = m2[1][i] + m2[3][i];
1456
14.8M
    m1[2][i] = m2[0][i] - m2[2][i];
1457
14.8M
    m1[3][i] = m2[1][i] - m2[3][i];
1458
14.8M
    m1[4][i] = m2[4][i] + m2[6][i];
1459
14.8M
    m1[5][i] = m2[5][i] + m2[7][i];
1460
14.8M
    m1[6][i] = m2[4][i] - m2[6][i];
1461
14.8M
    m1[7][i] = m2[5][i] - m2[7][i];
1462
1463
14.8M
    m2[0][i] = m1[0][i] + m1[1][i];
1464
14.8M
    m2[1][i] = m1[0][i] - m1[1][i];
1465
14.8M
    m2[2][i] = m1[2][i] + m1[3][i];
1466
14.8M
    m2[3][i] = m1[2][i] - m1[3][i];
1467
14.8M
    m2[4][i] = m1[4][i] + m1[5][i];
1468
14.8M
    m2[5][i] = m1[4][i] - m1[5][i];
1469
14.8M
    m2[6][i] = m1[6][i] + m1[7][i];
1470
14.8M
    m2[7][i] = m1[6][i] - m1[7][i];
1471
14.8M
  }
1472
1473
8.35M
  for( i = 0; i < 8; i++ )
1474
7.42M
  {
1475
126M
    for( j = 0; j < 16; j++ )
1476
118M
    {
1477
118M
      sad += abs( m2[i][j] );
1478
118M
    }
1479
7.42M
  }
1480
  
1481
928k
  sad -= abs( m2[0][0] );
1482
928k
  sad += abs( m2[0][0] ) >> 2;
1483
928k
  sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );
1484
1485
928k
  return sad;
1486
928k
}
1487
1488
static Distortion xCalcHADs8x16( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1489
931k
{
1490
931k
  int k, i, j, jj, sad = 0;
1491
931k
  int diff[128], m1[16][8], m2[16][8];
1492
15.8M
  for( k = 0; k < 128; k += 8 )
1493
14.9M
  {
1494
14.9M
    diff[k + 0] = piOrg[0] - piCur[0];
1495
14.9M
    diff[k + 1] = piOrg[1] - piCur[1];
1496
14.9M
    diff[k + 2] = piOrg[2] - piCur[2];
1497
14.9M
    diff[k + 3] = piOrg[3] - piCur[3];
1498
14.9M
    diff[k + 4] = piOrg[4] - piCur[4];
1499
14.9M
    diff[k + 5] = piOrg[5] - piCur[5];
1500
14.9M
    diff[k + 6] = piOrg[6] - piCur[6];
1501
14.9M
    diff[k + 7] = piOrg[7] - piCur[7];
1502
1503
14.9M
    piCur += iStrideCur;
1504
14.9M
    piOrg += iStrideOrg;
1505
14.9M
  }
1506
1507
  //horizontal
1508
15.8M
  for( j = 0; j < 16; j++ )
1509
14.9M
  {
1510
14.9M
    jj = j << 3;
1511
1512
14.9M
    m2[j][0] = diff[jj] + diff[jj + 4];
1513
14.9M
    m2[j][1] = diff[jj + 1] + diff[jj + 5];
1514
14.9M
    m2[j][2] = diff[jj + 2] + diff[jj + 6];
1515
14.9M
    m2[j][3] = diff[jj + 3] + diff[jj + 7];
1516
14.9M
    m2[j][4] = diff[jj] - diff[jj + 4];
1517
14.9M
    m2[j][5] = diff[jj + 1] - diff[jj + 5];
1518
14.9M
    m2[j][6] = diff[jj + 2] - diff[jj + 6];
1519
14.9M
    m2[j][7] = diff[jj + 3] - diff[jj + 7];
1520
1521
14.9M
    m1[j][0] = m2[j][0] + m2[j][2];
1522
14.9M
    m1[j][1] = m2[j][1] + m2[j][3];
1523
14.9M
    m1[j][2] = m2[j][0] - m2[j][2];
1524
14.9M
    m1[j][3] = m2[j][1] - m2[j][3];
1525
14.9M
    m1[j][4] = m2[j][4] + m2[j][6];
1526
14.9M
    m1[j][5] = m2[j][5] + m2[j][7];
1527
14.9M
    m1[j][6] = m2[j][4] - m2[j][6];
1528
14.9M
    m1[j][7] = m2[j][5] - m2[j][7];
1529
1530
14.9M
    m2[j][0] = m1[j][0] + m1[j][1];
1531
14.9M
    m2[j][1] = m1[j][0] - m1[j][1];
1532
14.9M
    m2[j][2] = m1[j][2] + m1[j][3];
1533
14.9M
    m2[j][3] = m1[j][2] - m1[j][3];
1534
14.9M
    m2[j][4] = m1[j][4] + m1[j][5];
1535
14.9M
    m2[j][5] = m1[j][4] - m1[j][5];
1536
14.9M
    m2[j][6] = m1[j][6] + m1[j][7];
1537
14.9M
    m2[j][7] = m1[j][6] - m1[j][7];
1538
14.9M
  }
1539
1540
  //vertical
1541
8.38M
  for( i = 0; i < 8; i++ )
1542
7.45M
  {
1543
7.45M
    m1[0][i] = m2[0][i] + m2[8][i];
1544
7.45M
    m1[1][i] = m2[1][i] + m2[9][i];
1545
7.45M
    m1[2][i] = m2[2][i] + m2[10][i];
1546
7.45M
    m1[3][i] = m2[3][i] + m2[11][i];
1547
7.45M
    m1[4][i] = m2[4][i] + m2[12][i];
1548
7.45M
    m1[5][i] = m2[5][i] + m2[13][i];
1549
7.45M
    m1[6][i] = m2[6][i] + m2[14][i];
1550
7.45M
    m1[7][i] = m2[7][i] + m2[15][i];
1551
7.45M
    m1[8][i] = m2[0][i] - m2[8][i];
1552
7.45M
    m1[9][i] = m2[1][i] - m2[9][i];
1553
7.45M
    m1[10][i] = m2[2][i] - m2[10][i];
1554
7.45M
    m1[11][i] = m2[3][i] - m2[11][i];
1555
7.45M
    m1[12][i] = m2[4][i] - m2[12][i];
1556
7.45M
    m1[13][i] = m2[5][i] - m2[13][i];
1557
7.45M
    m1[14][i] = m2[6][i] - m2[14][i];
1558
7.45M
    m1[15][i] = m2[7][i] - m2[15][i];
1559
1560
7.45M
    m2[0][i] = m1[0][i] + m1[4][i];
1561
7.45M
    m2[1][i] = m1[1][i] + m1[5][i];
1562
7.45M
    m2[2][i] = m1[2][i] + m1[6][i];
1563
7.45M
    m2[3][i] = m1[3][i] + m1[7][i];
1564
7.45M
    m2[4][i] = m1[0][i] - m1[4][i];
1565
7.45M
    m2[5][i] = m1[1][i] - m1[5][i];
1566
7.45M
    m2[6][i] = m1[2][i] - m1[6][i];
1567
7.45M
    m2[7][i] = m1[3][i] - m1[7][i];
1568
7.45M
    m2[8][i] = m1[8][i] + m1[12][i];
1569
7.45M
    m2[9][i] = m1[9][i] + m1[13][i];
1570
7.45M
    m2[10][i] = m1[10][i] + m1[14][i];
1571
7.45M
    m2[11][i] = m1[11][i] + m1[15][i];
1572
7.45M
    m2[12][i] = m1[8][i] - m1[12][i];
1573
7.45M
    m2[13][i] = m1[9][i] - m1[13][i];
1574
7.45M
    m2[14][i] = m1[10][i] - m1[14][i];
1575
7.45M
    m2[15][i] = m1[11][i] - m1[15][i];
1576
1577
7.45M
    m1[0][i] = m2[0][i] + m2[2][i];
1578
7.45M
    m1[1][i] = m2[1][i] + m2[3][i];
1579
7.45M
    m1[2][i] = m2[0][i] - m2[2][i];
1580
7.45M
    m1[3][i] = m2[1][i] - m2[3][i];
1581
7.45M
    m1[4][i] = m2[4][i] + m2[6][i];
1582
7.45M
    m1[5][i] = m2[5][i] + m2[7][i];
1583
7.45M
    m1[6][i] = m2[4][i] - m2[6][i];
1584
7.45M
    m1[7][i] = m2[5][i] - m2[7][i];
1585
7.45M
    m1[8][i] = m2[8][i] + m2[10][i];
1586
7.45M
    m1[9][i] = m2[9][i] + m2[11][i];
1587
7.45M
    m1[10][i] = m2[8][i] - m2[10][i];
1588
7.45M
    m1[11][i] = m2[9][i] - m2[11][i];
1589
7.45M
    m1[12][i] = m2[12][i] + m2[14][i];
1590
7.45M
    m1[13][i] = m2[13][i] + m2[15][i];
1591
7.45M
    m1[14][i] = m2[12][i] - m2[14][i];
1592
7.45M
    m1[15][i] = m2[13][i] - m2[15][i];
1593
1594
7.45M
    m2[0][i] = m1[0][i] + m1[1][i];
1595
7.45M
    m2[1][i] = m1[0][i] - m1[1][i];
1596
7.45M
    m2[2][i] = m1[2][i] + m1[3][i];
1597
7.45M
    m2[3][i] = m1[2][i] - m1[3][i];
1598
7.45M
    m2[4][i] = m1[4][i] + m1[5][i];
1599
7.45M
    m2[5][i] = m1[4][i] - m1[5][i];
1600
7.45M
    m2[6][i] = m1[6][i] + m1[7][i];
1601
7.45M
    m2[7][i] = m1[6][i] - m1[7][i];
1602
7.45M
    m2[8][i] = m1[8][i] + m1[9][i];
1603
7.45M
    m2[9][i] = m1[8][i] - m1[9][i];
1604
7.45M
    m2[10][i] = m1[10][i] + m1[11][i];
1605
7.45M
    m2[11][i] = m1[10][i] - m1[11][i];
1606
7.45M
    m2[12][i] = m1[12][i] + m1[13][i];
1607
7.45M
    m2[13][i] = m1[12][i] - m1[13][i];
1608
7.45M
    m2[14][i] = m1[14][i] + m1[15][i];
1609
7.45M
    m2[15][i] = m1[14][i] - m1[15][i];
1610
7.45M
  }
1611
1612
15.8M
  for( i = 0; i < 16; i++ )
1613
14.9M
  {
1614
134M
    for( j = 0; j < 8; j++ )
1615
119M
    {
1616
119M
      sad += abs( m2[i][j] );
1617
119M
    }
1618
14.9M
  }
1619
  
1620
931k
  sad -= abs( m2[0][0] );
1621
931k
  sad += abs( m2[0][0] ) >> 2;
1622
931k
  sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );
1623
1624
931k
  return sad;
1625
931k
}
1626
1627
static Distortion xCalcHADs4x8( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1628
168k
{
1629
168k
  int k, i, j, jj, sad = 0;
1630
168k
  int diff[32], m1[8][4], m2[8][4];
1631
1.51M
  for( k = 0; k < 32; k += 4 )
1632
1.34M
  {
1633
1.34M
    diff[k + 0] = piOrg[0] - piCur[0];
1634
1.34M
    diff[k + 1] = piOrg[1] - piCur[1];
1635
1.34M
    diff[k + 2] = piOrg[2] - piCur[2];
1636
1.34M
    diff[k + 3] = piOrg[3] - piCur[3];
1637
1638
1.34M
    piCur += iStrideCur;
1639
1.34M
    piOrg += iStrideOrg;
1640
1.34M
  }
1641
1642
  //horizontal
1643
1.51M
  for( j = 0; j < 8; j++ )
1644
1.34M
  {
1645
1.34M
    jj = j << 2;
1646
1.34M
    m2[j][0] = diff[jj] + diff[jj + 2];
1647
1.34M
    m2[j][1] = diff[jj + 1] + diff[jj + 3];
1648
1.34M
    m2[j][2] = diff[jj] - diff[jj + 2];
1649
1.34M
    m2[j][3] = diff[jj + 1] - diff[jj + 3];
1650
1651
1.34M
    m1[j][0] = m2[j][0] + m2[j][1];
1652
1.34M
    m1[j][1] = m2[j][0] - m2[j][1];
1653
1.34M
    m1[j][2] = m2[j][2] + m2[j][3];
1654
1.34M
    m1[j][3] = m2[j][2] - m2[j][3];
1655
1.34M
  }
1656
1657
  //vertical
1658
842k
  for( i = 0; i < 4; i++ )
1659
673k
  {
1660
673k
    m2[0][i] = m1[0][i] + m1[4][i];
1661
673k
    m2[1][i] = m1[1][i] + m1[5][i];
1662
673k
    m2[2][i] = m1[2][i] + m1[6][i];
1663
673k
    m2[3][i] = m1[3][i] + m1[7][i];
1664
673k
    m2[4][i] = m1[0][i] - m1[4][i];
1665
673k
    m2[5][i] = m1[1][i] - m1[5][i];
1666
673k
    m2[6][i] = m1[2][i] - m1[6][i];
1667
673k
    m2[7][i] = m1[3][i] - m1[7][i];
1668
1669
673k
    m1[0][i] = m2[0][i] + m2[2][i];
1670
673k
    m1[1][i] = m2[1][i] + m2[3][i];
1671
673k
    m1[2][i] = m2[0][i] - m2[2][i];
1672
673k
    m1[3][i] = m2[1][i] - m2[3][i];
1673
673k
    m1[4][i] = m2[4][i] + m2[6][i];
1674
673k
    m1[5][i] = m2[5][i] + m2[7][i];
1675
673k
    m1[6][i] = m2[4][i] - m2[6][i];
1676
673k
    m1[7][i] = m2[5][i] - m2[7][i];
1677
1678
673k
    m2[0][i] = m1[0][i] + m1[1][i];
1679
673k
    m2[1][i] = m1[0][i] - m1[1][i];
1680
673k
    m2[2][i] = m1[2][i] + m1[3][i];
1681
673k
    m2[3][i] = m1[2][i] - m1[3][i];
1682
673k
    m2[4][i] = m1[4][i] + m1[5][i];
1683
673k
    m2[5][i] = m1[4][i] - m1[5][i];
1684
673k
    m2[6][i] = m1[6][i] + m1[7][i];
1685
673k
    m2[7][i] = m1[6][i] - m1[7][i];
1686
673k
  }
1687
1688
1.51M
  for( i = 0; i < 8; i++ )
1689
1.34M
  {
1690
6.73M
    for( j = 0; j < 4; j++ )
1691
5.38M
    {
1692
5.38M
      sad += abs( m2[i][j] );
1693
5.38M
    }
1694
1.34M
  }
1695
  
1696
168k
  sad -= abs( m2[0][0] );
1697
168k
  sad += abs( m2[0][0] ) >> 2;
1698
168k
  sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );
1699
1700
168k
  return sad;
1701
168k
}
1702
1703
static Distortion xCalcHADs8x4( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur )
1704
175k
{
1705
175k
  int k, i, j, jj, sad = 0;
1706
175k
  int diff[32], m1[4][8], m2[4][8];
1707
876k
  for( k = 0; k < 32; k += 8 )
1708
701k
  {
1709
701k
    diff[k + 0] = piOrg[0] - piCur[0];
1710
701k
    diff[k + 1] = piOrg[1] - piCur[1];
1711
701k
    diff[k + 2] = piOrg[2] - piCur[2];
1712
701k
    diff[k + 3] = piOrg[3] - piCur[3];
1713
701k
    diff[k + 4] = piOrg[4] - piCur[4];
1714
701k
    diff[k + 5] = piOrg[5] - piCur[5];
1715
701k
    diff[k + 6] = piOrg[6] - piCur[6];
1716
701k
    diff[k + 7] = piOrg[7] - piCur[7];
1717
1718
701k
    piCur += iStrideCur;
1719
701k
    piOrg += iStrideOrg;
1720
701k
  }
1721
1722
  //horizontal
1723
876k
  for( j = 0; j < 4; j++ )
1724
701k
  {
1725
701k
    jj = j << 3;
1726
1727
701k
    m2[j][0] = diff[jj] + diff[jj + 4];
1728
701k
    m2[j][1] = diff[jj + 1] + diff[jj + 5];
1729
701k
    m2[j][2] = diff[jj + 2] + diff[jj + 6];
1730
701k
    m2[j][3] = diff[jj + 3] + diff[jj + 7];
1731
701k
    m2[j][4] = diff[jj] - diff[jj + 4];
1732
701k
    m2[j][5] = diff[jj + 1] - diff[jj + 5];
1733
701k
    m2[j][6] = diff[jj + 2] - diff[jj + 6];
1734
701k
    m2[j][7] = diff[jj + 3] - diff[jj + 7];
1735
1736
701k
    m1[j][0] = m2[j][0] + m2[j][2];
1737
701k
    m1[j][1] = m2[j][1] + m2[j][3];
1738
701k
    m1[j][2] = m2[j][0] - m2[j][2];
1739
701k
    m1[j][3] = m2[j][1] - m2[j][3];
1740
701k
    m1[j][4] = m2[j][4] + m2[j][6];
1741
701k
    m1[j][5] = m2[j][5] + m2[j][7];
1742
701k
    m1[j][6] = m2[j][4] - m2[j][6];
1743
701k
    m1[j][7] = m2[j][5] - m2[j][7];
1744
1745
701k
    m2[j][0] = m1[j][0] + m1[j][1];
1746
701k
    m2[j][1] = m1[j][0] - m1[j][1];
1747
701k
    m2[j][2] = m1[j][2] + m1[j][3];
1748
701k
    m2[j][3] = m1[j][2] - m1[j][3];
1749
701k
    m2[j][4] = m1[j][4] + m1[j][5];
1750
701k
    m2[j][5] = m1[j][4] - m1[j][5];
1751
701k
    m2[j][6] = m1[j][6] + m1[j][7];
1752
701k
    m2[j][7] = m1[j][6] - m1[j][7];
1753
701k
  }
1754
1755
  //vertical
1756
1.57M
  for( i = 0; i < 8; i++ )
1757
1.40M
  {
1758
1.40M
    m1[0][i] = m2[0][i] + m2[2][i];
1759
1.40M
    m1[1][i] = m2[1][i] + m2[3][i];
1760
1.40M
    m1[2][i] = m2[0][i] - m2[2][i];
1761
1.40M
    m1[3][i] = m2[1][i] - m2[3][i];
1762
1763
1.40M
    m2[0][i] = m1[0][i] + m1[1][i];
1764
1.40M
    m2[1][i] = m1[0][i] - m1[1][i];
1765
1.40M
    m2[2][i] = m1[2][i] + m1[3][i];
1766
1.40M
    m2[3][i] = m1[2][i] - m1[3][i];
1767
1.40M
  }
1768
1769
876k
  for( i = 0; i < 4; i++ )
1770
701k
  {
1771
6.31M
    for( j = 0; j < 8; j++ )
1772
5.60M
    {
1773
5.60M
      sad += abs( m2[i][j] );
1774
5.60M
    }
1775
701k
  }
1776
  
1777
175k
  sad -= abs( m2[0][0] );
1778
175k
  sad += abs( m2[0][0] ) >> 2;
1779
175k
  sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );
1780
1781
175k
  return sad;
1782
175k
}
1783
1784
Distortion RdCost::xGetHAD2SADs( const DistParam &rcDtParam )
1785
946k
{
1786
946k
  if( rcDtParam.applyWeight )
1787
0
  {
1788
0
    THROW(" no support");
1789
0
  }
1790
1791
946k
  Distortion distHad = xGetHADs<false>( rcDtParam );
1792
946k
  Distortion distSad = 0;
1793
946k
  {
1794
946k
    CHECKD( (rcDtParam.org.width != rcDtParam.org.stride) || (rcDtParam.cur.stride != rcDtParam.org.stride) , "this functions assumes compact, aligned buffering");
1795
1796
946k
    const Pel* piOrg  = rcDtParam.org.buf;
1797
946k
    const Pel* piCur  = rcDtParam.cur.buf;
1798
946k
    int  iRows        = rcDtParam.org.height>>2;
1799
946k
    int  iCols        = rcDtParam.org.width<<2;
1800
1801
946k
    Distortion uiSum = 0;
1802
1803
8.14M
    for( int y = 0; y < iRows;  y++ )
1804
7.20M
    {
1805
76.3M
      for (int n = 0; n < iCols; n+=16 )
1806
69.1M
      {
1807
69.1M
        uiSum += abs( piOrg[n+ 0] - piCur[n+ 0] );
1808
69.1M
        uiSum += abs( piOrg[n+ 1] - piCur[n+ 1] );
1809
69.1M
        uiSum += abs( piOrg[n+ 2] - piCur[n+ 2] );
1810
69.1M
        uiSum += abs( piOrg[n+ 3] - piCur[n+ 3] );
1811
69.1M
        uiSum += abs( piOrg[n+ 4] - piCur[n+ 4] );
1812
69.1M
        uiSum += abs( piOrg[n+ 5] - piCur[n+ 5] );
1813
69.1M
        uiSum += abs( piOrg[n+ 6] - piCur[n+ 6] );
1814
69.1M
        uiSum += abs( piOrg[n+ 7] - piCur[n+ 7] );
1815
69.1M
        uiSum += abs( piOrg[n+ 8] - piCur[n+ 8] );
1816
69.1M
        uiSum += abs( piOrg[n+ 9] - piCur[n+ 9] );
1817
69.1M
        uiSum += abs( piOrg[n+10] - piCur[n+10] );
1818
69.1M
        uiSum += abs( piOrg[n+11] - piCur[n+11] );
1819
69.1M
        uiSum += abs( piOrg[n+12] - piCur[n+12] );
1820
69.1M
        uiSum += abs( piOrg[n+13] - piCur[n+13] );
1821
69.1M
        uiSum += abs( piOrg[n+14] - piCur[n+14] );
1822
69.1M
        uiSum += abs( piOrg[n+15] - piCur[n+15] );
1823
69.1M
      }
1824
7.20M
      piOrg += iCols;
1825
7.20M
      piCur += iCols;
1826
7.20M
    }
1827
1828
946k
    distSad = (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1829
946k
  }
1830
1831
0
  return std::min( distHad, 2*distSad);
1832
946k
}
1833
1834
template<bool fastHad>
1835
Distortion RdCost::xGetHADs( const DistParam &rcDtParam )
1836
1.57M
{
1837
1.57M
  if( rcDtParam.applyWeight )
1838
0
  {
1839
0
    THROW(" no support");
1840
0
  }
1841
1.57M
  const Pel* piOrg = rcDtParam.org.buf;
1842
1.57M
  const Pel* piCur = rcDtParam.cur.buf;
1843
1.57M
  const int  iRows = rcDtParam.org.height;
1844
1.57M
  const int  iCols = rcDtParam.org.width;
1845
1.57M
  const int  iStrideCur = rcDtParam.cur.stride;
1846
1.57M
  const int  iStrideOrg = rcDtParam.org.stride;
1847
1848
1.57M
  int  x = 0, y = 0;
1849
1850
1.57M
  Distortion uiSum = 0;
1851
1852
1.57M
  if( iCols > iRows && ( iRows & 7 ) == 0 && ( iCols & 15 ) == 0 )
1853
341k
  {
1854
842k
    for( y = 0; y < iRows; y += 8 )
1855
501k
    {
1856
1.43M
      for( x = 0; x < iCols; x += 16 )
1857
928k
      {
1858
928k
        uiSum += xCalcHADs16x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1859
928k
      }
1860
501k
      piOrg += iStrideOrg * 8;
1861
501k
      piCur += iStrideCur * 8;
1862
501k
    }
1863
341k
  }
1864
1.22M
  else if( iCols < iRows && ( iCols & 7 ) == 0 && ( iRows & 15 ) == 0 )
1865
338k
  {
1866
946k
    for( y = 0; y < iRows; y += 16 )
1867
608k
    {
1868
1.53M
      for( x = 0; x < iCols; x += 8 )
1869
931k
      {
1870
931k
        uiSum += xCalcHADs8x16( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1871
931k
      }
1872
608k
      piOrg += iStrideOrg * 16;
1873
608k
      piCur += iStrideCur * 16;
1874
608k
    }
1875
338k
  }
1876
891k
  else if( iCols > iRows && ( iRows & 3 ) == 0 && ( iCols & 7 ) == 0 )
1877
94.9k
  {
1878
189k
    for( y = 0; y < iRows; y += 4 )
1879
94.9k
    {
1880
270k
      for( x = 0; x < iCols; x += 8 )
1881
175k
      {
1882
175k
        uiSum += xCalcHADs8x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1883
175k
      }
1884
94.9k
      piOrg += iStrideOrg * 4;
1885
94.9k
      piCur += iStrideCur * 4;
1886
94.9k
    }
1887
94.9k
  }
1888
796k
  else if( iCols < iRows && ( iCols & 3 ) == 0 && ( iRows & 7 ) == 0 )
1889
89.1k
  {
1890
257k
    for( y = 0; y < iRows; y += 8 )
1891
168k
    {
1892
336k
      for( x = 0; x < iCols; x += 4 )
1893
168k
      {
1894
168k
        uiSum += xCalcHADs4x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1895
168k
      }
1896
168k
      piOrg += iStrideOrg * 8;
1897
168k
      piCur += iStrideCur * 8;
1898
168k
    }
1899
89.1k
  }
1900
707k
  else if( fastHad && ( ( iRows % 32 == 0 ) && ( iCols % 32 == 0 ) ) && iRows == iCols )
1901
0
  {
1902
0
    for( y = 0; y < iRows; y += 16 )
1903
0
    {
1904
0
      for( x = 0; x < iCols; x += 16 )
1905
0
      {
1906
0
        uiSum += xCalcHADs16x16_fast( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1907
0
      }
1908
0
      piOrg += 16 * iStrideOrg;
1909
0
      piCur += 16 * iStrideCur;
1910
0
    }
1911
0
  }
1912
707k
  else if( ( iRows % 8 == 0 ) && ( iCols % 8 == 0 ) )
1913
632k
  {
1914
3.30M
    for( y = 0; y < iRows; y += 8 )
1915
2.67M
    {
1916
18.1M
      for( x = 0; x < iCols; x += 8 )
1917
15.4M
      {
1918
15.4M
        uiSum += xCalcHADs8x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1919
15.4M
      }
1920
2.67M
      piOrg += 8*iStrideOrg;
1921
2.67M
      piCur += 8*iStrideCur;
1922
2.67M
    }
1923
632k
  }
1924
74.8k
  else if( ( iRows % 4 == 0 ) && ( iCols % 4 == 0 ) )
1925
45.0k
  {
1926
90.0k
    for( y = 0; y < iRows; y += 4 )
1927
45.0k
    {
1928
90.0k
      for( x = 0; x < iCols; x += 4 )
1929
45.0k
      {
1930
45.0k
        uiSum += xCalcHADs4x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1931
45.0k
      }
1932
45.0k
      piOrg += 4*iStrideOrg;
1933
45.0k
      piCur += 4*iStrideCur;
1934
45.0k
    }
1935
45.0k
  }
1936
29.7k
  else if( ( iRows % 2 == 0 ) && ( iCols % 2 == 0 ) )
1937
29.7k
  {
1938
59.5k
    for( y = 0; y < iRows; y += 2 )
1939
29.7k
    {
1940
195k
      for( x = 0; x < iCols; x += 2 )
1941
165k
      {
1942
165k
        uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1943
165k
      }
1944
29.7k
      piOrg += 2*iStrideOrg;
1945
29.7k
      piCur += 2*iStrideCur;
1946
29.7k
    }
1947
29.7k
  }
1948
18.4E
  else
1949
18.4E
  {
1950
18.4E
    THROW( "Invalid size" );
1951
18.4E
  }
1952
1953
1.57M
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1954
1.57M
}
unsigned long vvenc::RdCost::xGetHADs<false>(vvenc::DistParam const&)
Line
Count
Source
1836
1.57M
{
1837
1.57M
  if( rcDtParam.applyWeight )
1838
0
  {
1839
0
    THROW(" no support");
1840
0
  }
1841
1.57M
  const Pel* piOrg = rcDtParam.org.buf;
1842
1.57M
  const Pel* piCur = rcDtParam.cur.buf;
1843
1.57M
  const int  iRows = rcDtParam.org.height;
1844
1.57M
  const int  iCols = rcDtParam.org.width;
1845
1.57M
  const int  iStrideCur = rcDtParam.cur.stride;
1846
1.57M
  const int  iStrideOrg = rcDtParam.org.stride;
1847
1848
1.57M
  int  x = 0, y = 0;
1849
1850
1.57M
  Distortion uiSum = 0;
1851
1852
1.57M
  if( iCols > iRows && ( iRows & 7 ) == 0 && ( iCols & 15 ) == 0 )
1853
341k
  {
1854
842k
    for( y = 0; y < iRows; y += 8 )
1855
501k
    {
1856
1.43M
      for( x = 0; x < iCols; x += 16 )
1857
928k
      {
1858
928k
        uiSum += xCalcHADs16x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1859
928k
      }
1860
501k
      piOrg += iStrideOrg * 8;
1861
501k
      piCur += iStrideCur * 8;
1862
501k
    }
1863
341k
  }
1864
1.22M
  else if( iCols < iRows && ( iCols & 7 ) == 0 && ( iRows & 15 ) == 0 )
1865
338k
  {
1866
946k
    for( y = 0; y < iRows; y += 16 )
1867
608k
    {
1868
1.53M
      for( x = 0; x < iCols; x += 8 )
1869
931k
      {
1870
931k
        uiSum += xCalcHADs8x16( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1871
931k
      }
1872
608k
      piOrg += iStrideOrg * 16;
1873
608k
      piCur += iStrideCur * 16;
1874
608k
    }
1875
338k
  }
1876
891k
  else if( iCols > iRows && ( iRows & 3 ) == 0 && ( iCols & 7 ) == 0 )
1877
94.9k
  {
1878
189k
    for( y = 0; y < iRows; y += 4 )
1879
94.9k
    {
1880
270k
      for( x = 0; x < iCols; x += 8 )
1881
175k
      {
1882
175k
        uiSum += xCalcHADs8x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1883
175k
      }
1884
94.9k
      piOrg += iStrideOrg * 4;
1885
94.9k
      piCur += iStrideCur * 4;
1886
94.9k
    }
1887
94.9k
  }
1888
796k
  else if( iCols < iRows && ( iCols & 3 ) == 0 && ( iRows & 7 ) == 0 )
1889
89.1k
  {
1890
257k
    for( y = 0; y < iRows; y += 8 )
1891
168k
    {
1892
336k
      for( x = 0; x < iCols; x += 4 )
1893
168k
      {
1894
168k
        uiSum += xCalcHADs4x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1895
168k
      }
1896
168k
      piOrg += iStrideOrg * 8;
1897
168k
      piCur += iStrideCur * 8;
1898
168k
    }
1899
89.1k
  }
1900
707k
  else if( fastHad && ( ( iRows % 32 == 0 ) && ( iCols % 32 == 0 ) ) && iRows == iCols )
1901
0
  {
1902
0
    for( y = 0; y < iRows; y += 16 )
1903
0
    {
1904
0
      for( x = 0; x < iCols; x += 16 )
1905
0
      {
1906
0
        uiSum += xCalcHADs16x16_fast( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1907
0
      }
1908
0
      piOrg += 16 * iStrideOrg;
1909
0
      piCur += 16 * iStrideCur;
1910
0
    }
1911
0
  }
1912
707k
  else if( ( iRows % 8 == 0 ) && ( iCols % 8 == 0 ) )
1913
632k
  {
1914
3.30M
    for( y = 0; y < iRows; y += 8 )
1915
2.67M
    {
1916
18.1M
      for( x = 0; x < iCols; x += 8 )
1917
15.4M
      {
1918
15.4M
        uiSum += xCalcHADs8x8( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1919
15.4M
      }
1920
2.67M
      piOrg += 8*iStrideOrg;
1921
2.67M
      piCur += 8*iStrideCur;
1922
2.67M
    }
1923
632k
  }
1924
74.8k
  else if( ( iRows % 4 == 0 ) && ( iCols % 4 == 0 ) )
1925
45.0k
  {
1926
90.0k
    for( y = 0; y < iRows; y += 4 )
1927
45.0k
    {
1928
90.0k
      for( x = 0; x < iCols; x += 4 )
1929
45.0k
      {
1930
45.0k
        uiSum += xCalcHADs4x4( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1931
45.0k
      }
1932
45.0k
      piOrg += 4*iStrideOrg;
1933
45.0k
      piCur += 4*iStrideCur;
1934
45.0k
    }
1935
45.0k
  }
1936
29.7k
  else if( ( iRows % 2 == 0 ) && ( iCols % 2 == 0 ) )
1937
29.7k
  {
1938
59.5k
    for( y = 0; y < iRows; y += 2 )
1939
29.7k
    {
1940
195k
      for( x = 0; x < iCols; x += 2 )
1941
165k
      {
1942
165k
        uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur );
1943
165k
      }
1944
29.7k
      piOrg += 2*iStrideOrg;
1945
29.7k
      piCur += 2*iStrideCur;
1946
29.7k
    }
1947
29.7k
  }
1948
18.4E
  else
1949
18.4E
  {
1950
18.4E
    THROW( "Invalid size" );
1951
18.4E
  }
1952
1953
1.57M
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
1954
1.57M
}
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs<true>(vvenc::DistParam const&)
1955
1956
1957
void RdCost::saveUnadjustedLambda()
1958
11.8k
{
1959
11.8k
  m_dLambda_unadjusted = m_dLambda;
1960
11.8k
  m_DistScaleUnadjusted = m_DistScale;
1961
11.8k
}
1962
1963
1964
inline Distortion getWeightedMSE(const Pel org, const Pel cur, const int64_t fixedPTweight, unsigned uiShift)
1965
0
{
1966
0
  const Intermediate_Int iTemp = org - cur;
1967
0
  return Intermediate_Int((fixedPTweight*(iTemp*iTemp) + (1 << 15)) >> uiShift);
1968
0
}
1969
1970
template<int csx>
1971
static Distortion lumaWeightedSSE_Core( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights )
1972
0
{
1973
0
        int  iRows = rcDtParam.org.height;
1974
0
  const Pel* piOrg = rcDtParam.org.buf;
1975
0
  const Pel* piCur = rcDtParam.cur.buf;
1976
0
  const int  iCols = rcDtParam.org.width;
1977
0
  const int  iStrideCur = rcDtParam.cur.stride;
1978
0
  const int  iStrideOrg = rcDtParam.org.stride;
1979
0
  const Pel* piOrgLuma        = rcDtParam.orgLuma->buf;
1980
0
  const int  iStrideOrgLuma   = rcDtParam.orgLuma->stride;
1981
1982
0
  Distortion uiSum   = 0;
1983
0
  uint32_t uiShift   = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1);
1984
1985
  // cf, column factor, offset of the second column, to be set to '0' for width of '1'
1986
0
  const int cf =  1 - ( iCols & 1 );
1987
0
  CHECK( ( iCols & 1 ) && iCols != 1, "Width can only be even or equal to '1'!" );
1988
0
  const ComponentID compId = rcDtParam.compID;
1989
0
  const size_t  cShiftY    = getComponentScaleY(compId, chmFmt);
1990
1991
0
  for( ; iRows != 0; iRows-- )
1992
0
  {
1993
0
    for (int n = 0; n < iCols; n+=2 )
1994
0
    {
1995
0
      uiSum += getWeightedMSE( piOrg[n   ], piCur[n   ], lumaWeights[piOrgLuma[(n   )<<csx]], uiShift );
1996
0
      uiSum += getWeightedMSE( piOrg[n+cf], piCur[n+cf], lumaWeights[piOrgLuma[(n+cf)<<csx]], uiShift );
1997
0
    }
1998
1999
0
    piOrg     += iStrideOrg;
2000
0
    piCur     += iStrideCur;
2001
0
    piOrgLuma += iStrideOrgLuma<<cShiftY;
2002
0
  }
2003
2004
0
  return ( uiSum >> ( 1 - cf ) );
2005
0
}
Unexecuted instantiation: RdCost.cpp:unsigned long vvenc::lumaWeightedSSE_Core<0>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
Unexecuted instantiation: RdCost.cpp:unsigned long vvenc::lumaWeightedSSE_Core<1>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
2006
2007
static Distortion fixWeightedSSE_Core( const DistParam& rcDtParam, uint32_t fixedPTweight )
2008
0
{
2009
0
        int  iRows = rcDtParam.org.height;
2010
0
  const Pel* piOrg = rcDtParam.org.buf;
2011
0
  const Pel* piCur = rcDtParam.cur.buf;
2012
0
  const int  iCols = rcDtParam.org.width;
2013
0
  const int  iStrideCur = rcDtParam.cur.stride;
2014
0
  const int  iStrideOrg = rcDtParam.org.stride;
2015
2016
0
  Distortion uiSum   = 0;
2017
0
  uint32_t uiShift = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1);
2018
2019
  // cf, column factor, offset of the second column, to be set to '0' for width of '1'
2020
0
  const int cf =  1 - ( iCols & 1 );
2021
0
  CHECK( ( iCols & 1 ) && iCols != 1, "Width can only be even or equal to '1'!" );
2022
  
2023
0
  for( ; iRows != 0; iRows-- )
2024
0
  {
2025
0
    for (int n = 0; n < iCols; n+=2 )
2026
0
    {
2027
0
      uiSum += getWeightedMSE( piOrg[n   ], piCur[n   ], fixedPTweight, uiShift );
2028
0
      uiSum += getWeightedMSE( piOrg[n+cf], piCur[n+cf], fixedPTweight, uiShift );
2029
0
    }
2030
0
    piOrg += iStrideOrg;
2031
0
    piCur += iStrideCur;
2032
0
  }
2033
2034
0
  return ( uiSum >> ( 1 - cf ) );
2035
0
}
2036
2037
Distortion RdCost::xGetSSE_WTD( const DistParam &rcDtParam ) const
2038
0
{
2039
0
  if( rcDtParam.applyWeight )
2040
0
  {
2041
0
    THROW("no support");
2042
0
  }
2043
2044
0
  if ((m_signalType == RESHAPE_SIGNAL_SDR || m_signalType == RESHAPE_SIGNAL_HLG) && rcDtParam.compID != COMP_Y)
2045
0
  {
2046
0
    const uint32_t fixedPTweight = ( uint32_t ) ( m_chromaWeight * ( double ) ( 1 << 16 ) );
2047
2048
0
    return m_fxdWtdPredPtr( rcDtParam, fixedPTweight );
2049
0
  }
2050
0
  else
2051
0
  {
2052
0
    return m_wtdPredPtr[getComponentScaleX(rcDtParam.compID, m_cf)]( rcDtParam, m_cf, m_reshapeLumaLevelToWeightPLUT );
2053
0
  }
2054
2055
0
  return 0;
2056
0
}
2057
2058
0
void RdCost::xGetSAD8X5(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
2059
0
  DistParam rcDtParamTmp0 = rcDtParam;
2060
2061
0
  DistParam rcDtParamTmp1 = rcDtParam;
2062
0
  rcDtParamTmp1.org.buf += 1;
2063
0
  rcDtParamTmp1.cur.buf -= 1;
2064
2065
0
  DistParam rcDtParamTmp2 = rcDtParam;
2066
0
  rcDtParamTmp2.org.buf += 2;
2067
0
  rcDtParamTmp2.cur.buf -= 2;
2068
2069
0
  DistParam rcDtParamTmp3 = rcDtParam;
2070
0
  rcDtParamTmp3.org.buf += 3;
2071
0
  rcDtParamTmp3.cur.buf -= 3;
2072
2073
0
  DistParam rcDtParamTmp4 = rcDtParam;
2074
0
  rcDtParamTmp4.org.buf += 4;
2075
0
  rcDtParamTmp4.cur.buf -= 4;
2076
  
2077
0
  cost[0] = (RdCost::xGetSAD8(rcDtParamTmp0)) >> 1;
2078
0
  cost[1] = (RdCost::xGetSAD8(rcDtParamTmp1)) >> 1;
2079
0
  if (isCalCentrePos) cost[2] = (RdCost::xGetSAD8(rcDtParamTmp2)) >> 1;
2080
0
  cost[3] = (RdCost::xGetSAD8(rcDtParamTmp3)) >> 1;
2081
0
  cost[4] = (RdCost::xGetSAD8(rcDtParamTmp4)) >> 1;
2082
0
}
2083
2084
0
void RdCost::xGetSAD16X5(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) {
2085
0
  DistParam rcDtParamTmp0 = rcDtParam;
2086
2087
0
  DistParam rcDtParamTmp1 = rcDtParam;
2088
0
  rcDtParamTmp1.org.buf += 1;
2089
0
  rcDtParamTmp1.cur.buf -= 1;
2090
2091
0
  DistParam rcDtParamTmp2 = rcDtParam;
2092
0
  rcDtParamTmp2.org.buf += 2;
2093
0
  rcDtParamTmp2.cur.buf -= 2;
2094
2095
0
  DistParam rcDtParamTmp3 = rcDtParam;
2096
0
  rcDtParamTmp3.org.buf += 3;
2097
0
  rcDtParamTmp3.cur.buf -= 3;
2098
2099
0
  DistParam rcDtParamTmp4 = rcDtParam;
2100
0
  rcDtParamTmp4.org.buf += 4;
2101
0
  rcDtParamTmp4.cur.buf -= 4;
2102
  
2103
0
  cost[0] = (RdCost::xGetSAD16(rcDtParamTmp0)) >> 1;
2104
0
  cost[1] = (RdCost::xGetSAD16(rcDtParamTmp1)) >> 1;
2105
0
  if (isCalCentrePos) cost[2] = (RdCost::xGetSAD16(rcDtParamTmp2)) >> 1;
2106
0
  cost[3] = (RdCost::xGetSAD16(rcDtParamTmp3)) >> 1;
2107
0
  cost[4] = (RdCost::xGetSAD16(rcDtParamTmp4)) >> 1;
2108
0
}
2109
2110
void RdCost::setDistParamGeo(DistParam &rcDP, const CPelBuf &org, const Pel *piRefY, int iRefStride, const Pel *mask,
2111
                          int iMaskStride, int stepX, int iMaskStride2, int bitDepth, ComponentID compID)
2112
0
{
2113
0
  rcDP.bitDepth = bitDepth;
2114
0
  rcDP.compID   = compID;
2115
2116
  // set Original & Curr Pointer / Stride
2117
0
  rcDP.org        = org;
2118
0
  rcDP.cur.buf    = piRefY;
2119
0
  rcDP.cur.stride = iRefStride;
2120
2121
  // set Mask
2122
0
  rcDP.mask        = mask;
2123
0
  rcDP.maskStride  = iMaskStride;
2124
0
  rcDP.stepX       = stepX;
2125
0
  rcDP.maskStride2 = iMaskStride2;
2126
2127
  // set Block Width / Height
2128
0
  rcDP.cur.width                     = org.width;
2129
0
  rcDP.cur.height                    = org.height;
2130
0
  rcDP.maximumDistortionForEarlyExit = MAX_DISTORTION;
2131
2132
  // set Cost function for motion estimation with Mask
2133
0
  rcDP.distFunc = m_afpDistortFunc[0][DF_SAD_WITH_MASK];
2134
0
}
2135
2136
Distortion RdCost::xGetSADwMask(const DistParam &rcDtParam)
2137
0
{
2138
0
  const Pel *    org             = rcDtParam.org.buf;
2139
0
  const Pel *    cur             = rcDtParam.cur.buf;
2140
0
  const Pel *    mask            = rcDtParam.mask;
2141
0
  const int      cols            = rcDtParam.org.width;
2142
0
  int            rows            = rcDtParam.org.height;
2143
0
  const int      subShift        = rcDtParam.subShift;
2144
0
  const int      subStep         = (1 << subShift);
2145
0
  const int      strideCur       = rcDtParam.cur.stride * subStep;
2146
0
  const int      strideOrg       = rcDtParam.org.stride * subStep;
2147
0
  const int      strideMask      = rcDtParam.maskStride * subStep;
2148
0
  const int      stepX           = rcDtParam.stepX;
2149
0
  const int      strideMask2     = rcDtParam.maskStride2;
2150
0
  const uint32_t distortionShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
2151
2152
0
  Distortion sum = 0;
2153
0
  for (; rows != 0; rows -= subStep)
2154
0
  {
2155
0
    for (int n = 0; n < cols; n++)
2156
0
    {
2157
0
      sum += abs(org[n] - cur[n]) * *mask;
2158
0
      mask += stepX;
2159
0
    }
2160
0
    org += strideOrg;
2161
0
    cur += strideCur;
2162
0
    mask += strideMask;
2163
0
    mask += strideMask2;
2164
0
  }
2165
0
  sum <<= subShift;
2166
0
  return (sum >> distortionShift);
2167
0
}
2168
2169
Distortion RdCost::getBvCostMultiplePredsIBC(int x, int y, bool useIMV)
2170
1.27M
{
2171
1.27M
  return Distortion(m_dCostIBC * getBitsMultiplePredsIBC(x, y, useIMV));
2172
1.27M
}
2173
2174
static inline unsigned getIComponentBitsIBC( int val )
2175
3.48M
{
2176
3.48M
  if( !val ) return 1;
2177
2178
18.4E
  const unsigned int l2 = floorLog2( (val <= 0) ? (-val << 1) + 1 : (val << 1) );
2179
2180
1.75M
  return (l2 << 1) + 1;
2181
3.48M
}
2182
2183
unsigned int RdCost::getBitsMultiplePredsIBC(int x, int y, bool useIMV)
2184
1.27M
{
2185
1.27M
  int rmvH[2];
2186
1.27M
  int rmvV[2];
2187
1.27M
  rmvH[0] = x - m_bvPredictors[0].hor;
2188
1.27M
  rmvH[1] = x - m_bvPredictors[1].hor;
2189
2190
1.27M
  rmvV[0] = y - m_bvPredictors[0].ver;
2191
1.27M
  rmvV[1] = y - m_bvPredictors[1].ver;
2192
1.27M
  int absCand[2];
2193
1.27M
  absCand[0] = abs(rmvH[0]) + abs(rmvV[0]);
2194
1.27M
  absCand[1] = abs(rmvH[1]) + abs(rmvV[1]);
2195
2196
1.27M
  if (useIMV && x % 4 == 0 && y % 4 == 0)
2197
465k
  {
2198
465k
    int rmvHQP[2];
2199
465k
    int rmvVQP[2];
2200
2201
465k
    int imvShift = 2;
2202
465k
    int offset = 1 << (imvShift - 1);
2203
2204
465k
    rmvHQP[0] = (x >> 2) - ((m_bvPredictors[0].hor + offset) >> 2);
2205
465k
    rmvHQP[1] = (x >> 2) - ((m_bvPredictors[1].hor + offset) >> 2);
2206
465k
    rmvVQP[0] = (y >> 2) - ((m_bvPredictors[0].ver + offset) >> 2);
2207
465k
    rmvVQP[1] = (y >> 2) - ((m_bvPredictors[1].ver + offset) >> 2);
2208
2209
465k
    int absCandQP[2];
2210
465k
    absCandQP[0] = abs(rmvHQP[0]) + abs(rmvVQP[0]);
2211
465k
    absCandQP[1] = abs(rmvHQP[1]) + abs(rmvVQP[1]);
2212
465k
    unsigned int candBits0QP, candBits1QP;
2213
465k
    if (absCand[0] < absCand[1])
2214
0
    {
2215
0
      unsigned int candBits0 = getIComponentBitsIBC(rmvH[0]) + getIComponentBitsIBC(rmvV[0]);
2216
0
      if (absCandQP[0] < absCandQP[1])
2217
0
      {
2218
0
        candBits0QP = getIComponentBitsIBC(rmvHQP[0]) + getIComponentBitsIBC(rmvVQP[0]);
2219
0
        return candBits0QP < candBits0 ? candBits0QP : candBits0;
2220
0
      }
2221
0
      else
2222
0
      {
2223
0
        candBits1QP = getIComponentBitsIBC(rmvHQP[1]) + getIComponentBitsIBC(rmvVQP[1]);
2224
0
        return candBits1QP < candBits0 ? candBits1QP : candBits0;
2225
0
      }
2226
0
    }
2227
465k
    else
2228
465k
    {
2229
465k
      unsigned int candBits1 = getIComponentBitsIBC(rmvH[1]) + getIComponentBitsIBC(rmvV[1]);
2230
465k
      if (absCandQP[0] < absCandQP[1])
2231
0
      {
2232
0
        candBits0QP = getIComponentBitsIBC(rmvHQP[0]) + getIComponentBitsIBC(rmvVQP[0]);
2233
0
        return candBits0QP < candBits1 ? candBits0QP : candBits1;
2234
0
      }
2235
465k
      else
2236
465k
      {
2237
465k
        candBits1QP = getIComponentBitsIBC(rmvHQP[1]) + getIComponentBitsIBC(rmvVQP[1]);
2238
18.4E
        return candBits1QP < candBits1 ? candBits1QP : candBits1;
2239
465k
      }
2240
465k
    }
2241
465k
  }
2242
809k
  else
2243
809k
  {
2244
809k
    if (absCand[0] < absCand[1])
2245
0
    {
2246
0
      return getIComponentBitsIBC(rmvH[0]) + getIComponentBitsIBC(rmvV[0]);
2247
0
    }
2248
809k
    else
2249
809k
    {
2250
809k
      return getIComponentBitsIBC(rmvH[1]) + getIComponentBitsIBC(rmvV[1]);
2251
809k
    }
2252
809k
  }
2253
1.27M
}
2254
2255
} // namespace vvenc
2256
2257
//! \}
2258