Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvenc/source/Lib/CommonLib/x86/QuantX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
/**
43
 * \file
44
 * \brief Implementation of quantization functions
45
 */
46
//#define USE_AVX2
47
// ====================================================================================================================
48
// Includes
49
// ====================================================================================================================
50
51
#pragma once
52
53
#include "CommonDefX86.h"
54
#include "Rom.h"
55
#include "QuantRDOQ2.h"
56
57
#if defined(TARGET_SIMD_X86)  && ENABLE_SIMD_OPT_QUANT
58
59
//! \ingroup CommonLib
60
//! \{
61
62
namespace vvenc {
63
64
#define cond_mm_prefetch(a,b) _mm_prefetch(a,b)
65
//#define cond_mm_prefetch(a,b)
66
67
static constexpr unsigned short levmask[16] = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0,0,0,0,0,0,0,0};
68
template<X86_VEXT vext>
69
static void DeQuantCoreSIMD(const int maxX,const int maxY,const int scale,const TCoeffSig *const piQCoef,const size_t piQCfStride,TCoeff *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
70
0
{
71
  // TODO: TCoeffSig!!!
72
0
  const int inputMinimum = -(inputMaximum+1);
73
0
  const TCoeff transformMinimum = -(transformMaximum+1);
74
0
  const int width = maxX+1;
75
76
0
  __m128i vlevmask;
77
0
  if (maxX<7)
78
0
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
79
0
  else
80
0
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
81
82
0
  if (rightShift>0)
83
0
  {
84
0
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);
85
86
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
87
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
88
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
89
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
90
0
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
91
0
    __m128i v_add = _mm_set1_epi32 (iAdd);
92
0
    __m128i v_rshift = _mm_set1_epi64x (rightShift);
93
94
0
    if (maxX<4)
95
0
    {
96
0
      for( int y = 0; y <= maxY; y++)
97
0
      {
98
0
        __m128i v_level = maxX == 1 ? _mm_set1_epi32( *( ( int const* ) & piQCoef[y * piQCfStride] ) ) : _vv_loadl_epi64( ( __m128i const* ) &piQCoef[y * piQCfStride] );
99
0
        v_level = _mm_and_si128(v_level,vlevmask);
100
0
        v_level = _mm_max_epi16 (v_level, v_min);
101
0
        v_level = _mm_min_epi16 (v_level, v_max);
102
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
103
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
104
105
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
106
0
        v_level =  _mm_add_epi32(v_level,v_add);
107
0
        v_level = _mm_sra_epi32(v_level,v_rshift);
108
109
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
110
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
111
0
        if( maxX == 1 )
112
0
          _vv_storel_epi64( (__m128i*)(piCoef + y * width), v_level );
113
0
        else
114
0
          _mm_storeu_si128( (__m128i*)(piCoef + y * width), v_level );
115
0
      }
116
0
    }
117
0
    else
118
0
    {
119
0
      for( int y = 0; y <= maxY; y++)
120
0
      {
121
0
        for( int x = 0; x <= maxX; x+=8)
122
0
        {
123
0
          __m128i v_level = _mm_loadu_si128( ( __m128i const* ) &piQCoef[x + y * piQCfStride] );
124
0
          v_level = _mm_and_si128(v_level,vlevmask);
125
0
          v_level = _mm_max_epi16 (v_level, v_min);
126
0
          v_level = _mm_min_epi16 (v_level, v_max);
127
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
128
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
129
130
0
          v_level = _mm_unpacklo_epi16(v_low,v_high);
131
0
          v_level =  _mm_add_epi32(v_level,v_add);
132
0
          v_level = _mm_sra_epi32(v_level,v_rshift);
133
134
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
135
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
136
0
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
137
138
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
139
0
          v_level =  _mm_add_epi32(v_level,v_add);
140
0
          v_level = _mm_sra_epi32(v_level,v_rshift);
141
142
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
143
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
144
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
145
0
        }
146
0
      }
147
0
    }
148
0
  }
149
0
  else  // rightShift <= 0
150
0
  {
151
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
152
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
153
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
154
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
155
0
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
156
0
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);
157
158
0
    if (maxX<4)
159
0
    {
160
0
      for( int y = 0; y <= maxY; y++)
161
0
      {
162
0
        __m128i v_level = maxX == 1 ? _mm_set1_epi32( *( ( int const* ) & piQCoef[y * piQCfStride] ) ) : _vv_loadl_epi64( ( __m128i const* ) &piQCoef[y * piQCfStride] );
163
0
        v_level = _mm_and_si128(v_level,vlevmask);
164
165
0
        v_level = _mm_max_epi16 (v_level, v_min);
166
0
        v_level = _mm_min_epi16 (v_level, v_max);
167
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
168
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
169
170
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
171
0
        v_level = _mm_sll_epi32(v_level,v_lshift);
172
173
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
174
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
175
176
0
        if( maxX == 1 )
177
0
        {
178
0
          _vv_storel_epi64( (__m128i*)(piCoef + y * width), v_level );
179
0
        }
180
0
        else
181
0
          _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
182
0
      }
183
0
    }
184
0
    else
185
0
    {
186
0
      for( int y = 0; y <= maxY; y++)
187
0
      {
188
0
        for( int x = 0; x <= maxX; x+=8)
189
0
        {
190
0
          __m128i v_level = _mm_loadu_si128( ( __m128i const* ) &piQCoef[x + y * piQCfStride] );
191
0
          v_level = _mm_and_si128(v_level,vlevmask);
192
0
          v_level = _mm_max_epi16 (v_level, v_min);
193
0
          v_level = _mm_min_epi16 (v_level, v_max);
194
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
195
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
196
197
0
          v_level = _mm_unpacklo_epi16(v_low,v_high);
198
0
          v_level = _mm_sll_epi32(v_level,v_lshift);
199
200
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
201
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
202
0
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
203
204
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
205
0
          v_level = _mm_sll_epi32(v_level,v_lshift);
206
207
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
208
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
209
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
210
0
        }
211
0
      }
212
0
    }
213
0
  }
214
0
}
Unexecuted instantiation: Quant_sse41.cpp:void vvenc::DeQuantCoreSIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, int, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvenc::DeQuantCoreSIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, int, short const*, unsigned long, int*, int, int, int)
215
216
template<X86_VEXT vext>
217
static void QuantCoreSIMD(const TransformUnit tu, const ComponentID compID, const CCoeffBuf& piCoef,CoeffSigBuf piQCoef,TCoeff &uiAbsSum, int &lastScanPos,TCoeff *deltaU,const int defaultQuantisationCoefficient,const int iQBits,const int64_t iAdd,const TCoeff entropyCodingMinimum,const TCoeff entropyCodingMaximum,const bool signHiding, const TCoeff m_thrVal)
218
0
{
219
0
  CoeffCodingContext cctx( tu, compID, signHiding );
220
221
0
  const CompArea &rect      = tu.blocks[compID];
222
0
  const uint32_t uiWidth    = rect.width;
223
0
  const uint32_t uiHeight   = rect.height;
224
0
  const uint32_t log2CGSize = cctx.log2CGSize();
225
226
0
  uiAbsSum = 0;
227
228
0
  const int iCGSize   = 1 << log2CGSize;
229
230
0
  const uint32_t lfnstIdx = tu.cu->lfnstIdx;
231
0
  const int iCGNum   = lfnstIdx > 0 ? 1 : std::min<int>(JVET_C0024_ZERO_OUT_TH, uiWidth) * std::min<int>(JVET_C0024_ZERO_OUT_TH, uiHeight) >> cctx.log2CGSize();
232
0
  int       iScanPos = ( iCGNum << log2CGSize ) - 1;
233
234
0
  if( lfnstIdx > 0 && ( ( uiWidth == 4 && uiHeight == 4 ) || ( uiWidth == 8 && uiHeight == 8 ) ) )
235
0
  {
236
0
    iScanPos = 7;
237
0
  }
238
239
  // Find first non-zero coeff
240
0
  for( ; iScanPos > 0; iScanPos-- )
241
0
  {
242
0
    uint32_t uiBlkPos = cctx.blockPos( iScanPos );
243
0
    if( piCoef.buf[uiBlkPos] )
244
0
      break;
245
0
  }
246
247
  //////////////////////////////////////////////////////////////////////////
248
  //  Loop over sub-sets (coefficient groups)
249
  //////////////////////////////////////////////////////////////////////////
250
  
251
0
  TCoeff thres = 0, useThres = 0;
252
  
253
0
  if( iQBits )
254
0
    thres = TCoeff( ( int64_t( m_thrVal ) << ( iQBits - 1 ) ) );
255
0
  else
256
0
    thres = TCoeff( ( int64_t( m_thrVal >> 1 ) << iQBits ) );
257
258
0
  useThres = thres / ( defaultQuantisationCoefficient << 2 );
259
260
0
  const bool is4x4sbb = log2CGSize == 4 && cctx.log2CGWidth() == 2;
261
262
0
  int subSetId = iScanPos >> log2CGSize;
263
  // if more than one 4x4 coding subblock is available, use SIMD to find first subblock with coefficient larger than threshold
264
0
  if( is4x4sbb && iScanPos >= 16 )
265
0
  {
266
0
    for( ; subSetId >= 1; subSetId-- )
267
0
    {
268
      // move the pointer to the beginning of the current subblock
269
0
      const int iScanPosinCG = iScanPos & ( iCGSize - 1 );
270
0
      const int firstTestPos = iScanPos - iScanPosinCG;
271
0
      uint32_t  uiBlkPos     = cctx.blockPos( firstTestPos );
272
273
0
      const __m128i xdfTh = _mm_set1_epi32( useThres );
274
275
      // read first line of the subblock and check for coefficients larger than the threshold
276
      // assuming the subblocks are dense 4x4 blocks in raster scan order with the stride of tuPars.m_width
277
0
      __m128i xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &piCoef.buf[uiBlkPos] ) );
278
0
      __m128i xdf = _mm_cmpgt_epi32( xl0, xdfTh );
279
280
      // same for the next line in the subblock
281
0
      uiBlkPos += uiWidth;
282
0
      xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &piCoef.buf[uiBlkPos] ) );
283
0
      xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
284
285
      // and the third line
286
0
      uiBlkPos += uiWidth;
287
0
      xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &piCoef.buf[uiBlkPos] ) );
288
0
      xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
289
290
      // and the last line
291
0
      uiBlkPos += uiWidth;
292
0
      xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &piCoef.buf[uiBlkPos] ) );
293
0
      xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
294
295
0
      if( _mm_testz_si128( xdf, xdf ) )
296
0
      {
297
0
        iScanPos -= iScanPosinCG + 1;
298
0
        continue;
299
0
      }
300
0
      else
301
0
      {
302
0
        break;
303
0
      }
304
0
    }
305
0
  }
306
307
0
  const int qBits8 = iQBits - 8;
308
0
  piQCoef.memset( 0 );
309
310
0
  lastScanPos = iScanPos;
311
312
0
  if( is4x4sbb && ( iScanPos & 15 ) == 15 )
313
0
  {
314
#if defined( USE_AVX2 ) && 0 // sometimes has undefined behavior
315
    if( vext >= AVX2 )
316
    {
317
      const __m256i vNull   = _mm256_setzero_si256();
318
      const __m256i vQuantCoeff = _mm256_set1_epi32(defaultQuantisationCoefficient);
319
      const __m256i vAdd    = _mm256_set1_epi64x(iAdd);
320
      const __m256i vMax    = _mm256_set1_epi32(entropyCodingMaximum);
321
      const __m256i vMin    = _mm256_set1_epi32(entropyCodingMinimum);
322
      const __m256i vMask   = _mm256_set_epi32( 0, -1, 0, -1, 0, -1, 0, -1 );
323
      __m256i vAbsSum = vNull;
324
325
      for( subSetId = iScanPos >> log2CGSize; subSetId >= 0; subSetId-- )
326
      {
327
        int uiBlockPos = cctx.blockPos( subSetId << log2CGSize );
328
329
        for( int line = 0; line < 4; line += 2, uiBlockPos += ( 2 * uiWidth ) )
330
        {
331
          __m256i  vLevel = _mm256_castsi128_si256 (         _mm_loadu_si128((__m128i *)&piCoef.buf[uiBlockPos]           )    ); // coeff7,coeff6,coeff5,coeff4,coeff3,coeff2,coeff1,coeff0,
332
                   vLevel = _mm256_inserti128_si256( vLevel, _mm_loadu_si128((__m128i *)&piCoef.buf[uiBlockPos + uiWidth] ), 1 ); // coeff7,coeff6,coeff5,coeff4,coeff3,coeff2,coeff1,coeff0,
333
          __m256i vSign = _mm256_cmpgt_epi32 (vNull,vLevel);                            // sign3,sign2,sign1,sign0 FFFF or 0000
334
          vLevel = _mm256_abs_epi32 (vLevel);
335
          __m256i vdeltaU0 = _mm256_mul_epu32(vLevel,vQuantCoeff);                      // Tmp2,Tmp0
336
          __m256i  vdeltaU1 = _mm256_srli_si256(vLevel,4);                              // abs(0,vLevel3,vLevel2,vLevel1)
337
          vdeltaU1 = _mm256_mul_epu32(vdeltaU1,vQuantCoeff);                            // Tmp3,Tmp1
338
          __m256i vTmpLevel_0 = _mm256_add_epi64(vdeltaU0,vAdd);
339
          __m256i vTmpLevel_1 = _mm256_add_epi64(vdeltaU1,vAdd);
340
          const __m128i vQBits = _mm_cvtsi32_si128(iQBits);
341
          vTmpLevel_0 = _mm256_srl_epi64(vTmpLevel_0,vQBits);                          // Int32 Tmp2,Tmp0
342
          vTmpLevel_1 = _mm256_srl_epi64(vTmpLevel_1,vQBits);                          // Int32 Tmp3,Tmp1
343
344
          if (signHiding)
345
          {
346
            __m256i vBS0 = _mm256_sll_epi64(vTmpLevel_0,vQBits);
347
            __m256i vBS1 = _mm256_sll_epi64(vTmpLevel_1,vQBits);
348
349
            vdeltaU0 = _mm256_sub_epi64(vdeltaU0,vBS0);
350
            vdeltaU1 = _mm256_sub_epi64(vdeltaU1,vBS1);
351
            const __m128i vQBits8 = _mm_cvtsi32_si128(qBits8);
352
            vdeltaU0 = _mm256_srl_epi64(vdeltaU0,vQBits8);
353
            vdeltaU1 = _mm256_srl_epi64(vdeltaU1,vQBits8);
354
            vdeltaU0 = _mm256_and_si256(vdeltaU0,vMask);
355
            vdeltaU1 = _mm256_and_si256(vdeltaU1,vMask);
356
            vdeltaU1 = _mm256_slli_epi64(vdeltaU1,32);
357
            vdeltaU0 = _mm256_or_si256(vdeltaU0,vdeltaU1);
358
            _mm_storeu_si128( ( __m128i * )&deltaU[uiBlockPos],          _mm256_castsi256_si128  (vdeltaU0));
359
            _mm_storeu_si128( ( __m128i * )&deltaU[uiBlockPos + uiWidth],_mm256_extracti128_si256(vdeltaU0, 1));
360
          }
361
          __m256i vquantMag0 = _mm256_and_si256(vTmpLevel_0,vMask);
362
          __m256i vquantMag1 = _mm256_and_si256(vTmpLevel_1,vMask);
363
          vquantMag1  = _mm256_slli_epi64(vquantMag1,32);
364
          vTmpLevel_0 = _mm256_or_si256(vquantMag0,vquantMag1);
365
          vAbsSum     = _mm256_add_epi32(vAbsSum,vTmpLevel_0);
366
          vTmpLevel_1 = _mm256_and_si256(vTmpLevel_0,vSign);                            // mask only neg values
367
          vTmpLevel_0 = _mm256_andnot_si256(vSign,vTmpLevel_0);                         // mask only pos values
368
          vTmpLevel_0 = _mm256_sub_epi32(vTmpLevel_0,vTmpLevel_1);
369
          vTmpLevel_0 = _mm256_min_epi32(vMax, _mm256_max_epi32(vMin,vTmpLevel_0));     // clip to 16 Bit
370
          vTmpLevel_0 = _mm256_packs_epi32(vTmpLevel_0,vTmpLevel_0);
371
          _vv_storel_epi64( ( __m128i * )&piQCoef.buf[uiBlockPos],          _mm256_castsi256_si128  (vTmpLevel_0));
372
          _vv_storel_epi64( ( __m128i * )&piQCoef.buf[uiBlockPos + uiWidth],_mm256_extracti128_si256(vTmpLevel_0, 1));
373
        }
374
      }
375
376
      __m128i xAbsSum = _mm_add_epi32( _mm256_castsi256_si128( vAbsSum ), _mm256_extracti128_si256( vAbsSum, 1 ) );
377
      xAbsSum = _mm_hadd_epi32( xAbsSum, xAbsSum );
378
      xAbsSum = _mm_hadd_epi32( xAbsSum, xAbsSum );
379
380
      uiAbsSum += _mm_cvtsi128_si32( xAbsSum );
381
    }  //AVX2
382
    else
383
#endif
384
0
    {
385
0
      const __m128i vNull = _mm_setzero_si128();
386
0
      const __m128i vQuantCoeff = _mm_set1_epi32(defaultQuantisationCoefficient);
387
0
      const __m128i vAdd  = _mm_set1_epi64x(iAdd);
388
0
      const __m128i vMin  = _mm_set1_epi32(entropyCodingMinimum);
389
0
      const __m128i vMax  = _mm_set1_epi32(entropyCodingMaximum);
390
0
      const __m128i vMask = _mm_set_epi32(0, -1, 0, -1);
391
0
      __m128i vAbsSum = vNull;
392
393
0
      for( subSetId = iScanPos >> log2CGSize; subSetId >= 0; subSetId-- )
394
0
      {
395
0
        int uiBlockPos = cctx.blockPos( subSetId << log2CGSize );
396
397
0
        for( int line = 0; line < 4; line++, uiBlockPos += uiWidth )
398
0
        {
399
0
          __m128i vLevel = _mm_loadu_si128((__m128i*)&piCoef.buf[uiBlockPos]);      // coeff3,coeff2,coeff1,coeff0,
400
0
          __m128i vSign = _mm_cmpgt_epi32 (vNull,vLevel);                           // sign3,sign2,sign1,sign0 FFFF or 0000
401
0
          vLevel = _mm_abs_epi32 (vLevel);                                          // abs(vLevel3,vLevel2,vLevel1,vLevel0)
402
0
          __m128i vdeltaU0 = _mm_mul_epu32(vLevel,vQuantCoeff);                     // Tmp2,Tmp0
403
0
          __m128i vdeltaU1 = _mm_srli_si128(vLevel,4);                             // abs(0,vLevel3,vLevel2,vLevel1)
404
0
          vdeltaU1 = _mm_mul_epu32(vdeltaU1,vQuantCoeff);                           // Tmp3,Tmp1
405
0
          __m128i vTmpLevel_0 = _mm_add_epi64(vdeltaU0,vAdd);
406
0
          __m128i vTmpLevel_1 = _mm_add_epi64(vdeltaU1,vAdd);
407
0
          const __m128i vQBits = _mm_cvtsi32_si128(iQBits);
408
0
          vTmpLevel_0 = _mm_srl_epi64(vTmpLevel_0,vQBits);                         // Int32 Tmp2,Tmp0
409
0
          vTmpLevel_1 = _mm_srl_epi64(vTmpLevel_1,vQBits);                         // Int32 Tmp3,Tmp1
410
0
          if (signHiding)
411
0
          {
412
0
            __m128i vBS0 = _mm_sll_epi64(vTmpLevel_0,vQBits);
413
0
            __m128i vBS1 = _mm_sll_epi64(vTmpLevel_1,vQBits);
414
0
            vdeltaU0 = _mm_sub_epi64(vdeltaU0,vBS0);
415
0
            vdeltaU1 = _mm_sub_epi64(vdeltaU1,vBS1);
416
0
            const __m128i vQBits8 = _mm_cvtsi32_si128(qBits8);
417
0
            vdeltaU0 = _mm_srl_epi64(vdeltaU0,vQBits8);
418
0
            vdeltaU1 = _mm_srl_epi64(vdeltaU1,vQBits8);
419
0
            vdeltaU0 = _mm_and_si128(vdeltaU0,vMask);
420
0
            vdeltaU1 = _mm_and_si128(vdeltaU1,vMask);
421
0
            vdeltaU1 = _mm_slli_epi64(vdeltaU1,32);
422
0
            vdeltaU0 = _mm_or_si128(vdeltaU0,vdeltaU1);
423
0
           _mm_storeu_si128( ( __m128i * ) &deltaU[uiBlockPos],vdeltaU0);
424
0
          }
425
0
          __m128i vquantMag0 =  _mm_and_si128(vTmpLevel_0,vMask);
426
0
          __m128i vquantMag1 =  _mm_and_si128(vTmpLevel_1,vMask);
427
0
          vquantMag1  = _mm_slli_epi64(vquantMag1,32);
428
0
          vTmpLevel_0 = _mm_or_si128(vquantMag0,vquantMag1);
429
0
          vAbsSum     = _mm_add_epi32(vAbsSum,vTmpLevel_0);
430
0
          vTmpLevel_1 = _mm_and_si128(vTmpLevel_0,vSign);                           // mask only neg values
431
0
          vTmpLevel_0 = _mm_andnot_si128(vSign,vTmpLevel_0);                        // mask only pos values
432
0
          vTmpLevel_0 = _mm_sub_epi32(vTmpLevel_0,vTmpLevel_1);
433
0
          vTmpLevel_0 = _mm_min_epi32(vMax, _mm_max_epi32(vMin,vTmpLevel_0));       // clip to 16 Bit
434
0
          vTmpLevel_0 = _mm_packs_epi32(vTmpLevel_0,vTmpLevel_0);
435
0
          _vv_storel_epi64( ( __m128i * ) &piQCoef.buf[uiBlockPos],vTmpLevel_0);
436
0
        }
437
0
      }
438
439
0
      vAbsSum = _mm_hadd_epi32( vAbsSum, vAbsSum );
440
0
      vAbsSum = _mm_hadd_epi32( vAbsSum, vAbsSum );
441
442
0
      uiAbsSum += _mm_cvtsi128_si32( vAbsSum );
443
0
    }
444
0
  }
445
0
  else
446
0
  {
447
0
    for( int currPos = 0; currPos <= iScanPos; currPos++ )
448
0
    {
449
0
      const int uiBlockPos  = cctx.blockPos( currPos );
450
0
      const TCoeff iLevel   = piCoef.buf[uiBlockPos];
451
0
      const TCoeff iSign    = (iLevel < 0 ? -1: 1);
452
0
      const int64_t  tmpLevel = (int64_t)abs(iLevel) * defaultQuantisationCoefficient;
453
0
      const TCoeff quantisedMagnitude = TCoeff((tmpLevel + iAdd ) >> iQBits);
454
0
      if (signHiding)
455
0
      {
456
0
        deltaU[uiBlockPos] = (TCoeff)((tmpLevel - ((int64_t)quantisedMagnitude<<iQBits) )>> qBits8);
457
0
      }
458
0
      uiAbsSum += quantisedMagnitude;
459
0
      const TCoeff quantisedCoefficient = quantisedMagnitude * iSign;
460
0
      piQCoef.buf[uiBlockPos] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
461
0
    } // for n
462
0
  }
463
0
}
Unexecuted instantiation: Quant_sse41.cpp:void vvenc::QuantCoreSIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::TransformUnit, vvenc::ComponentID, vvenc::AreaBuf<int const> const&, vvenc::AreaBuf<short>, int&, int&, int*, int, int, long, int, int, bool, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvenc::QuantCoreSIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::TransformUnit, vvenc::ComponentID, vvenc::AreaBuf<int const> const&, vvenc::AreaBuf<short>, int&, int&, int*, int, int, long, int, int, bool, int)
464
465
template<X86_VEXT vext>
466
static bool NeedRdoqSIMD( const TCoeff* pCoeff, size_t numCoeff, int quantCoeff, int64_t offset, int shift )
467
0
{
468
0
  const __m128i vshift = _mm_cvtsi32_si128( shift );
469
#if USE_AVX2
470
0
  if( vext >= AVX2 && ( numCoeff & 15 ) == 0 )
471
0
  {
472
0
    __m256i xqnt = _mm256_set1_epi32( quantCoeff );
473
0
    __m256i xoff = _mm256_set1_epi64x( offset );
474
475
0
    for( int uiBlockPos = 0; uiBlockPos < numCoeff; uiBlockPos += 16 )
476
0
    {
477
0
      __m256i xcff  = _mm256_loadu_si256( ( const __m256i* ) &pCoeff[uiBlockPos] );
478
0
              xcff  = _mm256_abs_epi32( xcff );
479
480
0
      __m256i xlvl1 = _mm256_mul_epi32( xcff, xqnt );
481
0
              xcff  = _mm256_shuffle_epi32( xcff, 1 + ( 3 << 4 ) );
482
0
      __m256i xlvl2 = _mm256_mul_epi32( xcff, xqnt );
483
0
              xlvl1 = _mm256_add_epi64( xlvl1, xoff );
484
0
              xlvl2 = _mm256_add_epi64( xlvl2, xoff );
485
0
              xlvl1 = _mm256_srl_epi64( xlvl1, vshift );
486
0
              xlvl2 = _mm256_srl_epi64( xlvl2, vshift );
487
488
0
      __m256i xany  = _mm256_or_si256( xlvl1, xlvl2 );
489
      
490
0
              xcff  = _mm256_loadu_si256( ( const __m256i* ) &pCoeff[uiBlockPos + 8] );
491
0
              xcff  = _mm256_abs_epi32( xcff );
492
              
493
0
              xlvl1 = _mm256_mul_epi32( xcff, xqnt );
494
0
              xcff  = _mm256_shuffle_epi32( xcff, 1 + ( 3 << 4 ) );
495
0
              xlvl2 = _mm256_mul_epi32( xcff, xqnt );
496
0
              xlvl1 = _mm256_add_epi64( xlvl1, xoff );
497
0
              xlvl2 = _mm256_add_epi64( xlvl2, xoff );
498
0
              xlvl1 = _mm256_srl_epi64( xlvl1, vshift );
499
0
              xlvl2 = _mm256_srl_epi64( xlvl2, vshift );
500
501
0
              xany  = _mm256_or_si256( xany, _mm256_or_si256( xlvl1, xlvl2 ) );
502
503
0
      if( !_mm256_testz_si256( xany, xany ) )
504
0
      {
505
0
        return true;
506
0
      }
507
0
    }
508
0
    return false;
509
0
  }
510
0
  else if( vext >= AVX2 && ( numCoeff & 7 ) == 0 )
511
0
  {
512
0
    __m256i xqnt = _mm256_set1_epi32( quantCoeff );
513
0
    __m256i xoff = _mm256_set1_epi64x( offset );
514
515
0
    for( int uiBlockPos = 0; uiBlockPos < numCoeff; uiBlockPos += 8 )
516
0
    {
517
0
      __m256i xcff  = _mm256_loadu_si256( ( const __m256i* ) &pCoeff[uiBlockPos] );
518
0
              xcff  = _mm256_abs_epi32( xcff );
519
520
0
      __m256i xlvl1 = _mm256_mul_epi32( xcff, xqnt );
521
0
              xcff  = _mm256_shuffle_epi32( xcff, 1 + ( 3 << 4 ) );
522
0
      __m256i xlvl2 = _mm256_mul_epi32( xcff, xqnt );
523
0
              xlvl1 = _mm256_add_epi64( xlvl1, xoff );
524
0
              xlvl2 = _mm256_add_epi64( xlvl2, xoff );
525
0
              xlvl1 = _mm256_srl_epi64( xlvl1, vshift );
526
0
              xlvl2 = _mm256_srl_epi64( xlvl2, vshift );
527
528
0
      __m256i xany  = _mm256_or_si256( xlvl1, xlvl2 );
529
530
0
      if( !_mm256_testz_si256( xany, xany ) )
531
0
      {
532
0
        return true;
533
0
      }
534
0
    }
535
0
    return false;
536
0
  }
537
0
  else
538
0
#endif
539
0
  if( ( numCoeff & 3 ) == 0 )
540
0
  {
541
0
    __m128i xqnt = _mm_set1_epi32( quantCoeff );
542
0
    __m128i xoff = _mm_set1_epi64x( offset );
543
544
0
    for( int uiBlockPos = 0; uiBlockPos < numCoeff; uiBlockPos += 4 )
545
0
    {
546
0
      __m128i xcff  = _mm_loadu_si128( ( const __m128i* ) &pCoeff[uiBlockPos] );
547
0
              xcff  = _mm_abs_epi32( xcff );
548
549
0
      __m128i xlvl1 = _mm_mul_epi32( xcff, xqnt );
550
0
              xcff  = _mm_shuffle_epi32( xcff, 1 + ( 3 << 4 ) );
551
0
      __m128i xlvl2 = _mm_mul_epi32( xcff, xqnt );
552
0
              xlvl1 = _mm_add_epi64( xlvl1, xoff );
553
0
              xlvl2 = _mm_add_epi64( xlvl2, xoff );
554
0
              xlvl1 = _mm_srl_epi64( xlvl1, vshift );
555
0
              xlvl2 = _mm_srl_epi64( xlvl2, vshift );
556
557
0
      __m128i xany  = _mm_or_si128( xlvl1, xlvl2 );
558
559
0
      if( !_mm_test_all_zeros( xany, xany ) )
560
0
      {
561
0
        return true;
562
0
      }
563
0
    }
564
0
    return false;
565
0
  }
566
0
  else
567
0
  {
568
0
    for( int uiBlockPos = 0; uiBlockPos < numCoeff; uiBlockPos++ )
569
0
    {
570
0
      const TCoeff   iLevel = pCoeff[uiBlockPos];
571
0
      const int64_t  tmpLevel = ( int64_t ) std::abs( iLevel ) * quantCoeff;
572
0
      const TCoeff quantisedMagnitude = TCoeff( ( tmpLevel + offset ) >> shift );
573
574
0
      if( quantisedMagnitude != 0 )
575
0
      {
576
0
        return true;
577
0
      }
578
0
    } // for n
579
0
    return false;
580
0
  }
581
0
}
Unexecuted instantiation: Quant_sse41.cpp:bool vvenc::NeedRdoqSIMD<(vvenc::x86_simd::X86_VEXT)1>(int const*, unsigned long, int, long, int)
Unexecuted instantiation: Quant_avx2.cpp:bool vvenc::NeedRdoqSIMD<(vvenc::x86_simd::X86_VEXT)4>(int const*, unsigned long, int, long, int)
582
583
template<X86_VEXT vext>
584
void Quant::_initQuantX86()
585
0
{
586
0
  xDeQuant  = DeQuantCoreSIMD<vext>;
587
0
  xQuant    = QuantCoreSIMD  <vext>;
588
0
  xNeedRdoq = NeedRdoqSIMD   <vext>;
589
0
}
Unexecuted instantiation: void vvenc::Quant::_initQuantX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::Quant::_initQuantX86<(vvenc::x86_simd::X86_VEXT)4>()
590
template void Quant::_initQuantX86<SIMDX86>();
591
592
593
} // namespace vvenc
594
595
//! \}
596
597
#endif // TARGET_SIMD_X86
598