Coverage Report

Created: 2026-06-16 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/QuantX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or
4
other Intellectual Property Rights other than the copyrights concerning
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     QuantX86.h
44
    \brief    SIMD for Quant/Dequant
45
*/
46
47
#include "CommonLib/CommonDef.h"
48
#include "CommonDefX86.h"
49
#include "CommonLib/Quant.h"
50
51
namespace vvdec
52
{
53
54
#if ENABLE_SIMD_OPT_QUANT
55
#ifdef TARGET_SIMD_X86
56
57
template<X86_VEXT vext, class T, bool UseScalingList, bool RightShiftPositive>
58
static inline void DeQuantImplSIMD( const SizeType width,
59
                                    const int      maxX,
60
                                    const int      maxY,
61
                                    const int      scaleQP,
62
                                    const int*     piDequantCoef,   // unused if UseScalingList == false
63
                                    const T* const piQCoef,
64
                                    const size_t   piQCfStride,
65
                                    TCoeff* const  piCoef,
66
                                    const int      rightShift,
67
                                    const int      inputMaximum,
68
                                    const TCoeff   transformMaximum )
69
59.8k
{
70
59.8k
  static_assert( sizeof( piQCoef[0] ) == sizeof( int16_t ) || sizeof( piQCoef[0] ) == sizeof( int32_t ), "wrong coeff type" );
71
72
59.8k
  constexpr static bool QCoef_16bit = sizeof( piQCoef[0] ) == sizeof( int16_t );
73
74
59.8k
  const int    inputMinimum     = -( inputMaximum + 1 );
75
59.8k
  const TCoeff transformMinimum = -( transformMaximum + 1 );
76
77
59.8k
  const int iAdd  = RightShiftPositive ? 1 << ( rightShift - 1 ) : 0;
78
59.8k
  const int shift = RightShiftPositive ? rightShift : -rightShift;
79
80
59.8k
  const __m128i vInputMin     = _mm_set1_epi32( inputMinimum );
81
59.8k
  const __m128i vInputMax     = _mm_set1_epi32( inputMaximum );
82
59.8k
  const __m128i vTransformMin = _mm_set1_epi32( transformMinimum );
83
59.8k
  const __m128i vTransformMax = _mm_set1_epi32( transformMaximum );
84
85
59.8k
  const __m128i vAdd   = _mm_set1_epi32( iAdd );
86
59.8k
  const __m128i vShift = _mm_set_epi64x( 0, shift );
87
59.8k
        __m128i vScale = _mm_set1_epi32( scaleQP );
88
89
#if USE_AVX2
90
  const __m256i xvInputMin     = _mm256_set1_epi32( inputMinimum );
91
  const __m256i xvInputMax     = _mm256_set1_epi32( inputMaximum );
92
  const __m256i xvTransformMin = _mm256_set1_epi32( transformMinimum );
93
  const __m256i xvTransformMax = _mm256_set1_epi32( transformMaximum );
94
95
  const __m256i xvAdd   = _mm256_set1_epi32( iAdd );
96
        __m256i xvScale = _mm256_set1_epi32( scaleQP );
97
#endif   // USE_AVX2
98
99
59.8k
  const int endX       = maxX + 1;
100
59.8k
  const int maskCoeffs = endX & 3;   // number of coefficients in the last vector read
101
  // clang-format off
102
59.8k
  const __m128i vMask = maskCoeffs == 3 ? _mm_set_epi32( 0, -1, -1, -1 ) :
103
59.8k
                      ( maskCoeffs == 2 ? _mm_set_epi32( 0, 0, -1, -1 )
104
59.7k
                                        : _mm_set_epi32( 0, 0, 0, -1 ) );
105
  // clang-format on
106
107
425k
  for( int y = 0; y <= maxY; y++ )
108
365k
  {
109
365k
    int x = 0;
110
365k
    int n = y * width;
111
112
#if USE_AVX2
113
526k
    for( ; x + 7 < endX; x += 8, n += 8 )
114
160k
    {
115
160k
      if( UseScalingList )
116
0
      {
117
0
        xvScale = _mm256_set1_epi32( scaleQP );
118
0
        xvScale = _mm256_mullo_epi32( xvScale, _mm256_loadu_si256( (__m256i*) &piDequantCoef[n] ) );
119
0
      }
120
121
160k
      __m256i xvLevel = QCoef_16bit ? _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
122
32.3k
                                    : _mm256_loadu_si256( (__m256i*) &piQCoef[x + y * piQCfStride] );                        // 32 bit coeffs
123
124
      xvLevel = _mm256_max_epi32( xvLevel, xvInputMin );
125
      xvLevel = _mm256_min_epi32( xvLevel, xvInputMax );
126
127
      xvLevel = _mm256_mullo_epi32( xvLevel, xvScale );
128
160k
      if( RightShiftPositive )
129
63.9k
      {
130
63.9k
        xvLevel = _mm256_add_epi32( xvLevel, xvAdd );
131
63.9k
        xvLevel = _mm256_sra_epi32( xvLevel, vShift );
132
63.9k
      }
133
96.8k
      else
134
96.8k
      {
135
96.8k
        xvLevel = _mm256_sll_epi32( xvLevel, vShift );
136
96.8k
      }
137
138
      xvLevel = _mm256_max_epi32( xvLevel, xvTransformMin );
139
      xvLevel = _mm256_min_epi32( xvLevel, xvTransformMax );
140
141
160k
      _mm256_storeu_si256( (__m256i*) &piCoef[n], xvLevel );
142
160k
    }
143
#endif   // USE_AVX2
144
145
624k
    for( ; x + 3 < endX; x += 4, n += 4 )
146
258k
    {
147
258k
      if( UseScalingList )
148
0
      {
149
0
        vScale = _mm_set1_epi32( scaleQP );
150
0
        vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) );
151
0
      }
152
153
258k
      __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
154
258k
                                   : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] );            // 32 bit coeffs
155
156
258k
      vLevel = _mm_max_epi32( vLevel, vInputMin );
157
258k
      vLevel = _mm_min_epi32( vLevel, vInputMax );
158
159
258k
      vLevel = _mm_mullo_epi32( vLevel, vScale );
160
258k
      if( RightShiftPositive )
161
27.0k
      {
162
27.0k
        vLevel = _mm_add_epi32( vLevel, vAdd );
163
27.0k
        vLevel = _mm_sra_epi32( vLevel, vShift );
164
27.0k
      }
165
231k
      else
166
231k
      {
167
231k
        vLevel = _mm_sll_epi32( vLevel, vShift );
168
231k
      }
169
170
258k
      vLevel = _mm_max_epi32( vLevel, vTransformMin );
171
258k
      vLevel = _mm_min_epi32( vLevel, vTransformMax );
172
173
258k
      _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel );
174
258k
    }
175
176
#if 0   // dequant remaining coefficients using scalar code
177
    (void)vMask;
178
    for( ; x < endX; x++, n++ )
179
    {
180
      const TCoeff level = piQCoef[x + y * piQCfStride];
181
      if( !level )
182
      {
183
        continue;
184
      }
185
186
      const int        scale     = UseScalingList ? piDequantCoef[n] * scaleQP   //
187
                                                  : scaleQP;
188
      const TCoeff     clipQCoef = TCoeff( Clip3<Intermediate_Int>( inputMinimum, inputMaximum, level ) );
189
      Intermediate_Int iCoeffQ   = RightShiftPositive ? ( Intermediate_Int( clipQCoef ) * scale + iAdd ) >> rightShift   //
190
                                                      : ( Intermediate_Int( clipQCoef ) * scale ) * ( 1 << shift );
191
192
      piCoef[n] = TCoeff( Clip3<Intermediate_Int>( transformMinimum, transformMaximum, iCoeffQ ) );
193
    }
194
195
#else   // dequant remaining coefficients using SSE
196
197
365k
    if( x < endX )
198
1.80k
    {
199
1.80k
      CHECKD( endX - x >= 4 || endX - x != maskCoeffs, "wrong mask for remaining coeffs" << ( endX - x ) << " " << maskCoeffs );
200
201
1.80k
      if( UseScalingList )
202
0
      {
203
0
        vScale = _mm_set1_epi32( scaleQP );
204
0
        vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) );
205
0
        vScale = _mm_and_si128( vScale, vMask );
206
0
      }
207
208
1.80k
      __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
209
1.80k
                                   : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] );            // 32 bit coeffs
210
211
1.80k
      vLevel = _mm_and_si128( vLevel, vMask );
212
213
1.80k
      vLevel = _mm_max_epi32( vLevel, vInputMin );
214
1.80k
      vLevel = _mm_min_epi32( vLevel, vInputMax );
215
216
1.80k
      vLevel = _mm_mullo_epi32( vLevel, vScale );
217
1.80k
      if( RightShiftPositive )
218
1.71k
      {
219
1.71k
        vLevel = _mm_add_epi32( vLevel, vAdd );
220
1.71k
        vLevel = _mm_sra_epi32( vLevel, vShift );
221
1.71k
      }
222
90
      else
223
90
      {
224
90
        vLevel = _mm_sll_epi32( vLevel, vShift );
225
90
      }
226
227
1.80k
      vLevel = _mm_max_epi32( vLevel, vTransformMin );
228
1.80k
      vLevel = _mm_min_epi32( vLevel, vTransformMax );
229
230
1.80k
      if( maskCoeffs <= 2 )
231
664
      {
232
664
        _mm_storeu_si64( (__m128i*) &piCoef[n], vLevel );
233
664
      }
234
1.14k
      else
235
1.14k
      {
236
1.14k
        _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel );
237
1.14k
      }
238
1.80k
    }
239
365k
#endif
240
365k
  }
241
59.8k
}
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, short, false, true>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, short, false, false>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, int, false, true>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, int, false, false>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, short, true, true>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, short, true, false>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, int, true, true>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, int, true, false>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int)
Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, short, false, true>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Line
Count
Source
69
5.97k
{
70
5.97k
  static_assert( sizeof( piQCoef[0] ) == sizeof( int16_t ) || sizeof( piQCoef[0] ) == sizeof( int32_t ), "wrong coeff type" );
71
72
5.97k
  constexpr static bool QCoef_16bit = sizeof( piQCoef[0] ) == sizeof( int16_t );
73
74
5.97k
  const int    inputMinimum     = -( inputMaximum + 1 );
75
5.97k
  const TCoeff transformMinimum = -( transformMaximum + 1 );
76
77
5.97k
  const int iAdd  = RightShiftPositive ? 1 << ( rightShift - 1 ) : 0;
78
5.97k
  const int shift = RightShiftPositive ? rightShift : -rightShift;
79
80
5.97k
  const __m128i vInputMin     = _mm_set1_epi32( inputMinimum );
81
5.97k
  const __m128i vInputMax     = _mm_set1_epi32( inputMaximum );
82
5.97k
  const __m128i vTransformMin = _mm_set1_epi32( transformMinimum );
83
5.97k
  const __m128i vTransformMax = _mm_set1_epi32( transformMaximum );
84
85
5.97k
  const __m128i vAdd   = _mm_set1_epi32( iAdd );
86
5.97k
  const __m128i vShift = _mm_set_epi64x( 0, shift );
87
5.97k
        __m128i vScale = _mm_set1_epi32( scaleQP );
88
89
5.97k
#if USE_AVX2
90
5.97k
  const __m256i xvInputMin     = _mm256_set1_epi32( inputMinimum );
91
5.97k
  const __m256i xvInputMax     = _mm256_set1_epi32( inputMaximum );
92
5.97k
  const __m256i xvTransformMin = _mm256_set1_epi32( transformMinimum );
93
5.97k
  const __m256i xvTransformMax = _mm256_set1_epi32( transformMaximum );
94
95
5.97k
  const __m256i xvAdd   = _mm256_set1_epi32( iAdd );
96
5.97k
        __m256i xvScale = _mm256_set1_epi32( scaleQP );
97
5.97k
#endif   // USE_AVX2
98
99
5.97k
  const int endX       = maxX + 1;
100
5.97k
  const int maskCoeffs = endX & 3;   // number of coefficients in the last vector read
101
  // clang-format off
102
5.97k
  const __m128i vMask = maskCoeffs == 3 ? _mm_set_epi32( 0, -1, -1, -1 ) :
103
5.97k
                      ( maskCoeffs == 2 ? _mm_set_epi32( 0, 0, -1, -1 )
104
5.82k
                                        : _mm_set_epi32( 0, 0, 0, -1 ) );
105
  // clang-format on
106
107
46.6k
  for( int y = 0; y <= maxY; y++ )
108
40.6k
  {
109
40.6k
    int x = 0;
110
40.6k
    int n = y * width;
111
112
40.6k
#if USE_AVX2
113
75.1k
    for( ; x + 7 < endX; x += 8, n += 8 )
114
34.5k
    {
115
34.5k
      if( UseScalingList )
116
0
      {
117
0
        xvScale = _mm256_set1_epi32( scaleQP );
118
0
        xvScale = _mm256_mullo_epi32( xvScale, _mm256_loadu_si256( (__m256i*) &piDequantCoef[n] ) );
119
0
      }
120
121
34.5k
      __m256i xvLevel = QCoef_16bit ? _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
122
34.5k
                                    : _mm256_loadu_si256( (__m256i*) &piQCoef[x + y * piQCfStride] );                        // 32 bit coeffs
123
124
34.5k
      xvLevel = _mm256_max_epi32( xvLevel, xvInputMin );
125
34.5k
      xvLevel = _mm256_min_epi32( xvLevel, xvInputMax );
126
127
34.5k
      xvLevel = _mm256_mullo_epi32( xvLevel, xvScale );
128
34.5k
      if( RightShiftPositive )
129
34.5k
      {
130
34.5k
        xvLevel = _mm256_add_epi32( xvLevel, xvAdd );
131
34.5k
        xvLevel = _mm256_sra_epi32( xvLevel, vShift );
132
34.5k
      }
133
0
      else
134
0
      {
135
0
        xvLevel = _mm256_sll_epi32( xvLevel, vShift );
136
0
      }
137
138
34.5k
      xvLevel = _mm256_max_epi32( xvLevel, xvTransformMin );
139
34.5k
      xvLevel = _mm256_min_epi32( xvLevel, xvTransformMax );
140
141
34.5k
      _mm256_storeu_si256( (__m256i*) &piCoef[n], xvLevel );
142
34.5k
    }
143
40.6k
#endif   // USE_AVX2
144
145
64.0k
    for( ; x + 3 < endX; x += 4, n += 4 )
146
23.4k
    {
147
23.4k
      if( UseScalingList )
148
0
      {
149
0
        vScale = _mm_set1_epi32( scaleQP );
150
0
        vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) );
151
0
      }
152
153
23.4k
      __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
154
23.4k
                                   : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] );            // 32 bit coeffs
155
156
23.4k
      vLevel = _mm_max_epi32( vLevel, vInputMin );
157
23.4k
      vLevel = _mm_min_epi32( vLevel, vInputMax );
158
159
23.4k
      vLevel = _mm_mullo_epi32( vLevel, vScale );
160
23.4k
      if( RightShiftPositive )
161
23.4k
      {
162
23.4k
        vLevel = _mm_add_epi32( vLevel, vAdd );
163
23.4k
        vLevel = _mm_sra_epi32( vLevel, vShift );
164
23.4k
      }
165
0
      else
166
0
      {
167
0
        vLevel = _mm_sll_epi32( vLevel, vShift );
168
0
      }
169
170
23.4k
      vLevel = _mm_max_epi32( vLevel, vTransformMin );
171
23.4k
      vLevel = _mm_min_epi32( vLevel, vTransformMax );
172
173
23.4k
      _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel );
174
23.4k
    }
175
176
#if 0   // dequant remaining coefficients using scalar code
177
    (void)vMask;
178
    for( ; x < endX; x++, n++ )
179
    {
180
      const TCoeff level = piQCoef[x + y * piQCfStride];
181
      if( !level )
182
      {
183
        continue;
184
      }
185
186
      const int        scale     = UseScalingList ? piDequantCoef[n] * scaleQP   //
187
                                                  : scaleQP;
188
      const TCoeff     clipQCoef = TCoeff( Clip3<Intermediate_Int>( inputMinimum, inputMaximum, level ) );
189
      Intermediate_Int iCoeffQ   = RightShiftPositive ? ( Intermediate_Int( clipQCoef ) * scale + iAdd ) >> rightShift   //
190
                                                      : ( Intermediate_Int( clipQCoef ) * scale ) * ( 1 << shift );
191
192
      piCoef[n] = TCoeff( Clip3<Intermediate_Int>( transformMinimum, transformMaximum, iCoeffQ ) );
193
    }
194
195
#else   // dequant remaining coefficients using SSE
196
197
40.6k
    if( x < endX )
198
1.71k
    {
199
1.71k
      CHECKD( endX - x >= 4 || endX - x != maskCoeffs, "wrong mask for remaining coeffs" << ( endX - x ) << " " << maskCoeffs );
200
201
1.71k
      if( UseScalingList )
202
0
      {
203
0
        vScale = _mm_set1_epi32( scaleQP );
204
0
        vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) );
205
0
        vScale = _mm_and_si128( vScale, vMask );
206
0
      }
207
208
1.71k
      __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
209
1.71k
                                   : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] );            // 32 bit coeffs
210
211
1.71k
      vLevel = _mm_and_si128( vLevel, vMask );
212
213
1.71k
      vLevel = _mm_max_epi32( vLevel, vInputMin );
214
1.71k
      vLevel = _mm_min_epi32( vLevel, vInputMax );
215
216
1.71k
      vLevel = _mm_mullo_epi32( vLevel, vScale );
217
1.71k
      if( RightShiftPositive )
218
1.71k
      {
219
1.71k
        vLevel = _mm_add_epi32( vLevel, vAdd );
220
1.71k
        vLevel = _mm_sra_epi32( vLevel, vShift );
221
1.71k
      }
222
0
      else
223
0
      {
224
0
        vLevel = _mm_sll_epi32( vLevel, vShift );
225
0
      }
226
227
1.71k
      vLevel = _mm_max_epi32( vLevel, vTransformMin );
228
1.71k
      vLevel = _mm_min_epi32( vLevel, vTransformMax );
229
230
1.71k
      if( maskCoeffs <= 2 )
231
627
      {
232
627
        _mm_storeu_si64( (__m128i*) &piCoef[n], vLevel );
233
627
      }
234
1.09k
      else
235
1.09k
      {
236
1.09k
        _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel );
237
1.09k
      }
238
1.71k
    }
239
40.6k
#endif
240
40.6k
  }
241
5.97k
}
Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, short, false, false>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Line
Count
Source
69
51.6k
{
70
51.6k
  static_assert( sizeof( piQCoef[0] ) == sizeof( int16_t ) || sizeof( piQCoef[0] ) == sizeof( int32_t ), "wrong coeff type" );
71
72
51.6k
  constexpr static bool QCoef_16bit = sizeof( piQCoef[0] ) == sizeof( int16_t );
73
74
51.6k
  const int    inputMinimum     = -( inputMaximum + 1 );
75
51.6k
  const TCoeff transformMinimum = -( transformMaximum + 1 );
76
77
51.6k
  const int iAdd  = RightShiftPositive ? 1 << ( rightShift - 1 ) : 0;
78
51.6k
  const int shift = RightShiftPositive ? rightShift : -rightShift;
79
80
51.6k
  const __m128i vInputMin     = _mm_set1_epi32( inputMinimum );
81
51.6k
  const __m128i vInputMax     = _mm_set1_epi32( inputMaximum );
82
51.6k
  const __m128i vTransformMin = _mm_set1_epi32( transformMinimum );
83
51.6k
  const __m128i vTransformMax = _mm_set1_epi32( transformMaximum );
84
85
51.6k
  const __m128i vAdd   = _mm_set1_epi32( iAdd );
86
51.6k
  const __m128i vShift = _mm_set_epi64x( 0, shift );
87
51.6k
        __m128i vScale = _mm_set1_epi32( scaleQP );
88
89
51.6k
#if USE_AVX2
90
51.6k
  const __m256i xvInputMin     = _mm256_set1_epi32( inputMinimum );
91
51.6k
  const __m256i xvInputMax     = _mm256_set1_epi32( inputMaximum );
92
51.6k
  const __m256i xvTransformMin = _mm256_set1_epi32( transformMinimum );
93
51.6k
  const __m256i xvTransformMax = _mm256_set1_epi32( transformMaximum );
94
95
51.6k
  const __m256i xvAdd   = _mm256_set1_epi32( iAdd );
96
51.6k
        __m256i xvScale = _mm256_set1_epi32( scaleQP );
97
51.6k
#endif   // USE_AVX2
98
99
51.6k
  const int endX       = maxX + 1;
100
51.6k
  const int maskCoeffs = endX & 3;   // number of coefficients in the last vector read
101
  // clang-format off
102
51.6k
  const __m128i vMask = maskCoeffs == 3 ? _mm_set_epi32( 0, -1, -1, -1 ) :
103
51.6k
                      ( maskCoeffs == 2 ? _mm_set_epi32( 0, 0, -1, -1 )
104
51.5k
                                        : _mm_set_epi32( 0, 0, 0, -1 ) );
105
  // clang-format on
106
107
352k
  for( int y = 0; y <= maxY; y++ )
108
300k
  {
109
300k
    int x = 0;
110
300k
    int n = y * width;
111
112
300k
#if USE_AVX2
113
394k
    for( ; x + 7 < endX; x += 8, n += 8 )
114
93.9k
    {
115
93.9k
      if( UseScalingList )
116
0
      {
117
0
        xvScale = _mm256_set1_epi32( scaleQP );
118
0
        xvScale = _mm256_mullo_epi32( xvScale, _mm256_loadu_si256( (__m256i*) &piDequantCoef[n] ) );
119
0
      }
120
121
93.9k
      __m256i xvLevel = QCoef_16bit ? _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
122
93.9k
                                    : _mm256_loadu_si256( (__m256i*) &piQCoef[x + y * piQCfStride] );                        // 32 bit coeffs
123
124
93.9k
      xvLevel = _mm256_max_epi32( xvLevel, xvInputMin );
125
93.9k
      xvLevel = _mm256_min_epi32( xvLevel, xvInputMax );
126
127
93.9k
      xvLevel = _mm256_mullo_epi32( xvLevel, xvScale );
128
93.9k
      if( RightShiftPositive )
129
0
      {
130
0
        xvLevel = _mm256_add_epi32( xvLevel, xvAdd );
131
0
        xvLevel = _mm256_sra_epi32( xvLevel, vShift );
132
0
      }
133
93.9k
      else
134
93.9k
      {
135
93.9k
        xvLevel = _mm256_sll_epi32( xvLevel, vShift );
136
93.9k
      }
137
138
93.9k
      xvLevel = _mm256_max_epi32( xvLevel, xvTransformMin );
139
93.9k
      xvLevel = _mm256_min_epi32( xvLevel, xvTransformMax );
140
141
93.9k
      _mm256_storeu_si256( (__m256i*) &piCoef[n], xvLevel );
142
93.9k
    }
143
300k
#endif   // USE_AVX2
144
145
531k
    for( ; x + 3 < endX; x += 4, n += 4 )
146
231k
    {
147
231k
      if( UseScalingList )
148
0
      {
149
0
        vScale = _mm_set1_epi32( scaleQP );
150
0
        vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) );
151
0
      }
152
153
231k
      __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
154
231k
                                   : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] );            // 32 bit coeffs
155
156
231k
      vLevel = _mm_max_epi32( vLevel, vInputMin );
157
231k
      vLevel = _mm_min_epi32( vLevel, vInputMax );
158
159
231k
      vLevel = _mm_mullo_epi32( vLevel, vScale );
160
231k
      if( RightShiftPositive )
161
0
      {
162
0
        vLevel = _mm_add_epi32( vLevel, vAdd );
163
0
        vLevel = _mm_sra_epi32( vLevel, vShift );
164
0
      }
165
231k
      else
166
231k
      {
167
231k
        vLevel = _mm_sll_epi32( vLevel, vShift );
168
231k
      }
169
170
231k
      vLevel = _mm_max_epi32( vLevel, vTransformMin );
171
231k
      vLevel = _mm_min_epi32( vLevel, vTransformMax );
172
173
231k
      _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel );
174
231k
    }
175
176
#if 0   // dequant remaining coefficients using scalar code
177
    (void)vMask;
178
    for( ; x < endX; x++, n++ )
179
    {
180
      const TCoeff level = piQCoef[x + y * piQCfStride];
181
      if( !level )
182
      {
183
        continue;
184
      }
185
186
      const int        scale     = UseScalingList ? piDequantCoef[n] * scaleQP   //
187
                                                  : scaleQP;
188
      const TCoeff     clipQCoef = TCoeff( Clip3<Intermediate_Int>( inputMinimum, inputMaximum, level ) );
189
      Intermediate_Int iCoeffQ   = RightShiftPositive ? ( Intermediate_Int( clipQCoef ) * scale + iAdd ) >> rightShift   //
190
                                                      : ( Intermediate_Int( clipQCoef ) * scale ) * ( 1 << shift );
191
192
      piCoef[n] = TCoeff( Clip3<Intermediate_Int>( transformMinimum, transformMaximum, iCoeffQ ) );
193
    }
194
195
#else   // dequant remaining coefficients using SSE
196
197
300k
    if( x < endX )
198
90
    {
199
90
      CHECKD( endX - x >= 4 || endX - x != maskCoeffs, "wrong mask for remaining coeffs" << ( endX - x ) << " " << maskCoeffs );
200
201
90
      if( UseScalingList )
202
0
      {
203
0
        vScale = _mm_set1_epi32( scaleQP );
204
0
        vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) );
205
0
        vScale = _mm_and_si128( vScale, vMask );
206
0
      }
207
208
90
      __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
209
90
                                   : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] );            // 32 bit coeffs
210
211
90
      vLevel = _mm_and_si128( vLevel, vMask );
212
213
90
      vLevel = _mm_max_epi32( vLevel, vInputMin );
214
90
      vLevel = _mm_min_epi32( vLevel, vInputMax );
215
216
90
      vLevel = _mm_mullo_epi32( vLevel, vScale );
217
90
      if( RightShiftPositive )
218
0
      {
219
0
        vLevel = _mm_add_epi32( vLevel, vAdd );
220
0
        vLevel = _mm_sra_epi32( vLevel, vShift );
221
0
      }
222
90
      else
223
90
      {
224
90
        vLevel = _mm_sll_epi32( vLevel, vShift );
225
90
      }
226
227
90
      vLevel = _mm_max_epi32( vLevel, vTransformMin );
228
90
      vLevel = _mm_min_epi32( vLevel, vTransformMax );
229
230
90
      if( maskCoeffs <= 2 )
231
37
      {
232
37
        _mm_storeu_si64( (__m128i*) &piCoef[n], vLevel );
233
37
      }
234
53
      else
235
53
      {
236
53
        _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel );
237
53
      }
238
90
    }
239
300k
#endif
240
300k
  }
241
51.6k
}
Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, int, false, true>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int)
Line
Count
Source
69
2.10k
{
70
2.10k
  static_assert( sizeof( piQCoef[0] ) == sizeof( int16_t ) || sizeof( piQCoef[0] ) == sizeof( int32_t ), "wrong coeff type" );
71
72
2.10k
  constexpr static bool QCoef_16bit = sizeof( piQCoef[0] ) == sizeof( int16_t );
73
74
2.10k
  const int    inputMinimum     = -( inputMaximum + 1 );
75
2.10k
  const TCoeff transformMinimum = -( transformMaximum + 1 );
76
77
2.10k
  const int iAdd  = RightShiftPositive ? 1 << ( rightShift - 1 ) : 0;
78
2.10k
  const int shift = RightShiftPositive ? rightShift : -rightShift;
79
80
2.10k
  const __m128i vInputMin     = _mm_set1_epi32( inputMinimum );
81
2.10k
  const __m128i vInputMax     = _mm_set1_epi32( inputMaximum );
82
2.10k
  const __m128i vTransformMin = _mm_set1_epi32( transformMinimum );
83
2.10k
  const __m128i vTransformMax = _mm_set1_epi32( transformMaximum );
84
85
2.10k
  const __m128i vAdd   = _mm_set1_epi32( iAdd );
86
2.10k
  const __m128i vShift = _mm_set_epi64x( 0, shift );
87
2.10k
        __m128i vScale = _mm_set1_epi32( scaleQP );
88
89
2.10k
#if USE_AVX2
90
2.10k
  const __m256i xvInputMin     = _mm256_set1_epi32( inputMinimum );
91
2.10k
  const __m256i xvInputMax     = _mm256_set1_epi32( inputMaximum );
92
2.10k
  const __m256i xvTransformMin = _mm256_set1_epi32( transformMinimum );
93
2.10k
  const __m256i xvTransformMax = _mm256_set1_epi32( transformMaximum );
94
95
2.10k
  const __m256i xvAdd   = _mm256_set1_epi32( iAdd );
96
2.10k
        __m256i xvScale = _mm256_set1_epi32( scaleQP );
97
2.10k
#endif   // USE_AVX2
98
99
2.10k
  const int endX       = maxX + 1;
100
2.10k
  const int maskCoeffs = endX & 3;   // number of coefficients in the last vector read
101
  // clang-format off
102
2.10k
  const __m128i vMask = maskCoeffs == 3 ? _mm_set_epi32( 0, -1, -1, -1 ) :
103
2.10k
                      ( maskCoeffs == 2 ? _mm_set_epi32( 0, 0, -1, -1 )
104
2.10k
                                        : _mm_set_epi32( 0, 0, 0, -1 ) );
105
  // clang-format on
106
107
24.1k
  for( int y = 0; y <= maxY; y++ )
108
22.0k
  {
109
22.0k
    int x = 0;
110
22.0k
    int n = y * width;
111
112
22.0k
#if USE_AVX2
113
51.4k
    for( ; x + 7 < endX; x += 8, n += 8 )
114
29.4k
    {
115
29.4k
      if( UseScalingList )
116
0
      {
117
0
        xvScale = _mm256_set1_epi32( scaleQP );
118
0
        xvScale = _mm256_mullo_epi32( xvScale, _mm256_loadu_si256( (__m256i*) &piDequantCoef[n] ) );
119
0
      }
120
121
29.4k
      __m256i xvLevel = QCoef_16bit ? _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
122
29.4k
                                    : _mm256_loadu_si256( (__m256i*) &piQCoef[x + y * piQCfStride] );                        // 32 bit coeffs
123
124
29.4k
      xvLevel = _mm256_max_epi32( xvLevel, xvInputMin );
125
29.4k
      xvLevel = _mm256_min_epi32( xvLevel, xvInputMax );
126
127
29.4k
      xvLevel = _mm256_mullo_epi32( xvLevel, xvScale );
128
29.4k
      if( RightShiftPositive )
129
29.4k
      {
130
29.4k
        xvLevel = _mm256_add_epi32( xvLevel, xvAdd );
131
29.4k
        xvLevel = _mm256_sra_epi32( xvLevel, vShift );
132
29.4k
      }
133
0
      else
134
0
      {
135
0
        xvLevel = _mm256_sll_epi32( xvLevel, vShift );
136
0
      }
137
138
29.4k
      xvLevel = _mm256_max_epi32( xvLevel, xvTransformMin );
139
29.4k
      xvLevel = _mm256_min_epi32( xvLevel, xvTransformMax );
140
141
29.4k
      _mm256_storeu_si256( (__m256i*) &piCoef[n], xvLevel );
142
29.4k
    }
143
22.0k
#endif   // USE_AVX2
144
145
25.6k
    for( ; x + 3 < endX; x += 4, n += 4 )
146
3.62k
    {
147
3.62k
      if( UseScalingList )
148
0
      {
149
0
        vScale = _mm_set1_epi32( scaleQP );
150
0
        vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) );
151
0
      }
152
153
3.62k
      __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
154
3.62k
                                   : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] );            // 32 bit coeffs
155
156
3.62k
      vLevel = _mm_max_epi32( vLevel, vInputMin );
157
3.62k
      vLevel = _mm_min_epi32( vLevel, vInputMax );
158
159
3.62k
      vLevel = _mm_mullo_epi32( vLevel, vScale );
160
3.62k
      if( RightShiftPositive )
161
3.62k
      {
162
3.62k
        vLevel = _mm_add_epi32( vLevel, vAdd );
163
3.62k
        vLevel = _mm_sra_epi32( vLevel, vShift );
164
3.62k
      }
165
0
      else
166
0
      {
167
0
        vLevel = _mm_sll_epi32( vLevel, vShift );
168
0
      }
169
170
3.62k
      vLevel = _mm_max_epi32( vLevel, vTransformMin );
171
3.62k
      vLevel = _mm_min_epi32( vLevel, vTransformMax );
172
173
3.62k
      _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel );
174
3.62k
    }
175
176
#if 0   // dequant remaining coefficients using scalar code
177
    (void)vMask;
178
    for( ; x < endX; x++, n++ )
179
    {
180
      const TCoeff level = piQCoef[x + y * piQCfStride];
181
      if( !level )
182
      {
183
        continue;
184
      }
185
186
      const int        scale     = UseScalingList ? piDequantCoef[n] * scaleQP   //
187
                                                  : scaleQP;
188
      const TCoeff     clipQCoef = TCoeff( Clip3<Intermediate_Int>( inputMinimum, inputMaximum, level ) );
189
      Intermediate_Int iCoeffQ   = RightShiftPositive ? ( Intermediate_Int( clipQCoef ) * scale + iAdd ) >> rightShift   //
190
                                                      : ( Intermediate_Int( clipQCoef ) * scale ) * ( 1 << shift );
191
192
      piCoef[n] = TCoeff( Clip3<Intermediate_Int>( transformMinimum, transformMaximum, iCoeffQ ) );
193
    }
194
195
#else   // dequant remaining coefficients using SSE
196
197
22.0k
    if( x < endX )
198
0
    {
199
0
      CHECKD( endX - x >= 4 || endX - x != maskCoeffs, "wrong mask for remaining coeffs" << ( endX - x ) << " " << maskCoeffs );
200
201
0
      if( UseScalingList )
202
0
      {
203
0
        vScale = _mm_set1_epi32( scaleQP );
204
0
        vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) );
205
0
        vScale = _mm_and_si128( vScale, vMask );
206
0
      }
207
208
0
      __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
209
0
                                   : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] );            // 32 bit coeffs
210
211
0
      vLevel = _mm_and_si128( vLevel, vMask );
212
213
0
      vLevel = _mm_max_epi32( vLevel, vInputMin );
214
0
      vLevel = _mm_min_epi32( vLevel, vInputMax );
215
216
0
      vLevel = _mm_mullo_epi32( vLevel, vScale );
217
0
      if( RightShiftPositive )
218
0
      {
219
0
        vLevel = _mm_add_epi32( vLevel, vAdd );
220
0
        vLevel = _mm_sra_epi32( vLevel, vShift );
221
0
      }
222
0
      else
223
0
      {
224
0
        vLevel = _mm_sll_epi32( vLevel, vShift );
225
0
      }
226
227
0
      vLevel = _mm_max_epi32( vLevel, vTransformMin );
228
0
      vLevel = _mm_min_epi32( vLevel, vTransformMax );
229
230
0
      if( maskCoeffs <= 2 )
231
0
      {
232
0
        _mm_storeu_si64( (__m128i*) &piCoef[n], vLevel );
233
0
      }
234
0
      else
235
0
      {
236
0
        _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel );
237
0
      }
238
0
    }
239
22.0k
#endif
240
22.0k
  }
241
2.10k
}
Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, int, false, false>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int)
Line
Count
Source
69
198
{
70
198
  static_assert( sizeof( piQCoef[0] ) == sizeof( int16_t ) || sizeof( piQCoef[0] ) == sizeof( int32_t ), "wrong coeff type" );
71
72
198
  constexpr static bool QCoef_16bit = sizeof( piQCoef[0] ) == sizeof( int16_t );
73
74
198
  const int    inputMinimum     = -( inputMaximum + 1 );
75
198
  const TCoeff transformMinimum = -( transformMaximum + 1 );
76
77
198
  const int iAdd  = RightShiftPositive ? 1 << ( rightShift - 1 ) : 0;
78
198
  const int shift = RightShiftPositive ? rightShift : -rightShift;
79
80
198
  const __m128i vInputMin     = _mm_set1_epi32( inputMinimum );
81
198
  const __m128i vInputMax     = _mm_set1_epi32( inputMaximum );
82
198
  const __m128i vTransformMin = _mm_set1_epi32( transformMinimum );
83
198
  const __m128i vTransformMax = _mm_set1_epi32( transformMaximum );
84
85
198
  const __m128i vAdd   = _mm_set1_epi32( iAdd );
86
198
  const __m128i vShift = _mm_set_epi64x( 0, shift );
87
198
        __m128i vScale = _mm_set1_epi32( scaleQP );
88
89
198
#if USE_AVX2
90
198
  const __m256i xvInputMin     = _mm256_set1_epi32( inputMinimum );
91
198
  const __m256i xvInputMax     = _mm256_set1_epi32( inputMaximum );
92
198
  const __m256i xvTransformMin = _mm256_set1_epi32( transformMinimum );
93
198
  const __m256i xvTransformMax = _mm256_set1_epi32( transformMaximum );
94
95
198
  const __m256i xvAdd   = _mm256_set1_epi32( iAdd );
96
198
        __m256i xvScale = _mm256_set1_epi32( scaleQP );
97
198
#endif   // USE_AVX2
98
99
198
  const int endX       = maxX + 1;
100
198
  const int maskCoeffs = endX & 3;   // number of coefficients in the last vector read
101
  // clang-format off
102
198
  const __m128i vMask = maskCoeffs == 3 ? _mm_set_epi32( 0, -1, -1, -1 ) :
103
198
                      ( maskCoeffs == 2 ? _mm_set_epi32( 0, 0, -1, -1 )
104
198
                                        : _mm_set_epi32( 0, 0, 0, -1 ) );
105
  // clang-format on
106
107
2.30k
  for( int y = 0; y <= maxY; y++ )
108
2.10k
  {
109
2.10k
    int x = 0;
110
2.10k
    int n = y * width;
111
112
2.10k
#if USE_AVX2
113
5.00k
    for( ; x + 7 < endX; x += 8, n += 8 )
114
2.90k
    {
115
2.90k
      if( UseScalingList )
116
0
      {
117
0
        xvScale = _mm256_set1_epi32( scaleQP );
118
0
        xvScale = _mm256_mullo_epi32( xvScale, _mm256_loadu_si256( (__m256i*) &piDequantCoef[n] ) );
119
0
      }
120
121
2.90k
      __m256i xvLevel = QCoef_16bit ? _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
122
2.90k
                                    : _mm256_loadu_si256( (__m256i*) &piQCoef[x + y * piQCfStride] );                        // 32 bit coeffs
123
124
2.90k
      xvLevel = _mm256_max_epi32( xvLevel, xvInputMin );
125
2.90k
      xvLevel = _mm256_min_epi32( xvLevel, xvInputMax );
126
127
2.90k
      xvLevel = _mm256_mullo_epi32( xvLevel, xvScale );
128
2.90k
      if( RightShiftPositive )
129
0
      {
130
0
        xvLevel = _mm256_add_epi32( xvLevel, xvAdd );
131
0
        xvLevel = _mm256_sra_epi32( xvLevel, vShift );
132
0
      }
133
2.90k
      else
134
2.90k
      {
135
2.90k
        xvLevel = _mm256_sll_epi32( xvLevel, vShift );
136
2.90k
      }
137
138
2.90k
      xvLevel = _mm256_max_epi32( xvLevel, xvTransformMin );
139
2.90k
      xvLevel = _mm256_min_epi32( xvLevel, xvTransformMax );
140
141
2.90k
      _mm256_storeu_si256( (__m256i*) &piCoef[n], xvLevel );
142
2.90k
    }
143
2.10k
#endif   // USE_AVX2
144
145
2.47k
    for( ; x + 3 < endX; x += 4, n += 4 )
146
368
    {
147
368
      if( UseScalingList )
148
0
      {
149
0
        vScale = _mm_set1_epi32( scaleQP );
150
0
        vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) );
151
0
      }
152
153
368
      __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
154
368
                                   : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] );            // 32 bit coeffs
155
156
368
      vLevel = _mm_max_epi32( vLevel, vInputMin );
157
368
      vLevel = _mm_min_epi32( vLevel, vInputMax );
158
159
368
      vLevel = _mm_mullo_epi32( vLevel, vScale );
160
368
      if( RightShiftPositive )
161
0
      {
162
0
        vLevel = _mm_add_epi32( vLevel, vAdd );
163
0
        vLevel = _mm_sra_epi32( vLevel, vShift );
164
0
      }
165
368
      else
166
368
      {
167
368
        vLevel = _mm_sll_epi32( vLevel, vShift );
168
368
      }
169
170
368
      vLevel = _mm_max_epi32( vLevel, vTransformMin );
171
368
      vLevel = _mm_min_epi32( vLevel, vTransformMax );
172
173
368
      _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel );
174
368
    }
175
176
#if 0   // dequant remaining coefficients using scalar code
177
    (void)vMask;
178
    for( ; x < endX; x++, n++ )
179
    {
180
      const TCoeff level = piQCoef[x + y * piQCfStride];
181
      if( !level )
182
      {
183
        continue;
184
      }
185
186
      const int        scale     = UseScalingList ? piDequantCoef[n] * scaleQP   //
187
                                                  : scaleQP;
188
      const TCoeff     clipQCoef = TCoeff( Clip3<Intermediate_Int>( inputMinimum, inputMaximum, level ) );
189
      Intermediate_Int iCoeffQ   = RightShiftPositive ? ( Intermediate_Int( clipQCoef ) * scale + iAdd ) >> rightShift   //
190
                                                      : ( Intermediate_Int( clipQCoef ) * scale ) * ( 1 << shift );
191
192
      piCoef[n] = TCoeff( Clip3<Intermediate_Int>( transformMinimum, transformMaximum, iCoeffQ ) );
193
    }
194
195
#else   // dequant remaining coefficients using SSE
196
197
2.10k
    if( x < endX )
198
0
    {
199
0
      CHECKD( endX - x >= 4 || endX - x != maskCoeffs, "wrong mask for remaining coeffs" << ( endX - x ) << " " << maskCoeffs );
200
201
0
      if( UseScalingList )
202
0
      {
203
0
        vScale = _mm_set1_epi32( scaleQP );
204
0
        vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) );
205
0
        vScale = _mm_and_si128( vScale, vMask );
206
0
      }
207
208
0
      __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) )   // 16 bit coeffs
209
0
                                   : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] );            // 32 bit coeffs
210
211
0
      vLevel = _mm_and_si128( vLevel, vMask );
212
213
0
      vLevel = _mm_max_epi32( vLevel, vInputMin );
214
0
      vLevel = _mm_min_epi32( vLevel, vInputMax );
215
216
0
      vLevel = _mm_mullo_epi32( vLevel, vScale );
217
0
      if( RightShiftPositive )
218
0
      {
219
0
        vLevel = _mm_add_epi32( vLevel, vAdd );
220
0
        vLevel = _mm_sra_epi32( vLevel, vShift );
221
0
      }
222
0
      else
223
0
      {
224
0
        vLevel = _mm_sll_epi32( vLevel, vShift );
225
0
      }
226
227
0
      vLevel = _mm_max_epi32( vLevel, vTransformMin );
228
0
      vLevel = _mm_min_epi32( vLevel, vTransformMax );
229
230
0
      if( maskCoeffs <= 2 )
231
0
      {
232
0
        _mm_storeu_si64( (__m128i*) &piCoef[n], vLevel );
233
0
      }
234
0
      else
235
0
      {
236
0
        _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel );
237
0
      }
238
0
    }
239
2.10k
#endif
240
2.10k
  }
241
198
}
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, short, true, true>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, short, true, false>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, int, true, true>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, int, true, false>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int)
242
243
// Dequantization entry point for blocks that do NOT use a scaling list.
//
// Selects one of three implementations:
//   * maxX < 2        -> the scalar Quant::DeQuantCore<T>
//   * rightShift > 0  -> DeQuantImplSIMD<vext, T, false, true>
//   * otherwise       -> DeQuantImplSIMD<vext, T, false, false>
// Judging by the names used inside the DeQuantImplSIMD body, the two bool
// template arguments are UseScalingList and RightShiftPositive; nullptr is
// passed in the scaling-list (piDequantCoef) position.
//
// Parameters are forwarded unchanged. Their meaning, as used by the
// DeQuantImplSIMD body in this file:
//   width            row pitch of piCoef in coefficients (n = y * width)
//   maxX, maxY       last column / row index processed (inclusive)
//   scale            multiplier applied to every clipped input level
//   piQCoef          input levels, addressed as piQCoef[x + y * piQCfStride]
//   piQCfStride      row pitch of piQCoef in elements
//   piCoef           output coefficients
//   rightShift       > 0: rounding arithmetic right shift; otherwise the
//                    product is shifted left by -rightShift
//   inputMaximum     input levels are clipped to [-(inputMaximum + 1), inputMaximum]
//   transformMaximum results are clipped to [-(transformMaximum + 1), transformMaximum]
template<X86_VEXT vext, class T>
244
static void DeQuantCoreSIMD( const SizeType width,
245
                             const int      maxX,
246
                             const int      maxY,
247
                             const int      scale,
248
                             const T* const piQCoef,
249
                             const size_t   piQCfStride,
250
                             TCoeff* const  piCoef,
251
                             const int      rightShift,
252
                             const int      inputMaximum,
253
                             const TCoeff   transformMaximum )
254
79.9k
{
255
79.9k
  if( maxX < 2 )   // fewer than three columns: use the scalar kernel (presumably because the vector tail handles at least two coefficients per row -- confirm)
256
20.1k
  {
257
20.1k
    Quant::DeQuantCore<T>(width,
258
20.1k
                   maxX,
259
20.1k
                   maxY,
260
20.1k
                   scale,
261
20.1k
                   piQCoef,
262
20.1k
                   piQCfStride,
263
20.1k
                   piCoef,
264
20.1k
                   rightShift,
265
20.1k
                   inputMaximum,
266
20.1k
                   transformMaximum );
267
20.1k
  }
268
59.8k
  else if( rightShift > 0 )   // shift direction is resolved here once, so the kernel can branch on a template constant
269
8.07k
  {
270
8.07k
    DeQuantImplSIMD<vext, T, false, true>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum );
271
8.07k
  }
272
51.7k
  else
273
51.7k
  {
274
51.7k
    DeQuantImplSIMD<vext, T, false, false>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum );
275
51.7k
  }
276
79.9k
}
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)1, short>(unsigned int, int, int, int, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)1, int>(unsigned int, int, int, int, int const*, unsigned long, int*, int, int, int)
Quant_avx2.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)4, short>(unsigned int, int, int, int, short const*, unsigned long, int*, int, int, int)
Line
Count
Source
254
77.6k
{
255
77.6k
  if( maxX < 2 )
256
20.1k
  {
257
20.1k
    Quant::DeQuantCore<T>(width,
258
20.1k
                   maxX,
259
20.1k
                   maxY,
260
20.1k
                   scale,
261
20.1k
                   piQCoef,
262
20.1k
                   piQCfStride,
263
20.1k
                   piCoef,
264
20.1k
                   rightShift,
265
20.1k
                   inputMaximum,
266
20.1k
                   transformMaximum );
267
20.1k
  }
268
57.5k
  else if( rightShift > 0 )
269
5.97k
  {
270
5.97k
    DeQuantImplSIMD<vext, T, false, true>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum );
271
5.97k
  }
272
51.5k
  else
273
51.5k
  {
274
51.5k
    DeQuantImplSIMD<vext, T, false, false>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum );
275
51.5k
  }
276
77.6k
}
Quant_avx2.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)4, int>(unsigned int, int, int, int, int const*, unsigned long, int*, int, int, int)
Line
Count
Source
254
2.29k
{
255
2.29k
  if( maxX < 2 )
256
0
  {
257
0
    Quant::DeQuantCore<T>(width,
258
0
                   maxX,
259
0
                   maxY,
260
0
                   scale,
261
0
                   piQCoef,
262
0
                   piQCfStride,
263
0
                   piCoef,
264
0
                   rightShift,
265
0
                   inputMaximum,
266
0
                   transformMaximum );
267
0
  }
268
2.29k
  else if( rightShift > 0 )
269
2.10k
  {
270
2.10k
    DeQuantImplSIMD<vext, T, false, true>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum );
271
2.10k
  }
272
198
  else
273
198
  {
274
198
    DeQuantImplSIMD<vext, T, false, false>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum );
275
198
  }
276
2.29k
}
277
278
// Dequantization entry point for blocks that DO use a scaling list.
//
// Same dispatch as DeQuantCoreSIMD, but piDequantCoef is forwarded and the
// third template argument of DeQuantImplSIMD is true:
//   * maxX < 2        -> the scalar Quant::DeQuantScalingCore<T>
//   * rightShift > 0  -> DeQuantImplSIMD<vext, T, true, true>
//   * otherwise       -> DeQuantImplSIMD<vext, T, true, false>
//
// In the DeQuantImplSIMD body the per-coefficient scale is computed as
// scaleQP * piDequantCoef[n], with n = y * width + x. piDequantCoef is
// therefore indexed with the same pitch as the output piCoef.
// All other parameters are forwarded unchanged.
//
// NOTE(review): every line of this function has a hit count of 0 in this
// report, so the scaling-list path is not exercised by the measured run.
template<X86_VEXT vext, class T>
279
static void DeQuantScalingCoreSIMD( const SizeType width,
280
                                    const int      maxX,
281
                                    const int      maxY,
282
                                    const int      scaleQP,
283
                                    const int*     piDequantCoef,
284
                                    const T* const piQCoef,
285
                                    const size_t   piQCfStride,
286
                                    TCoeff* const  piCoef,
287
                                    const int      rightShift,
288
                                    const int      inputMaximum,
289
                                    const TCoeff   transformMaximum )
290
0
{
291
0
  if( maxX < 2 )   // fewer than three columns: use the scalar kernel
292
0
  {
293
0
    Quant::DeQuantScalingCore<T>(width,
294
0
                          maxX,
295
0
                          maxY,
296
0
                          scaleQP,
297
0
                          piDequantCoef,
298
0
                          piQCoef,
299
0
                          piQCfStride,
300
0
                          piCoef,
301
0
                          rightShift,
302
0
                          inputMaximum,
303
0
                          transformMaximum );
304
0
  }
305
0
  else if( rightShift > 0 )   // positive shift: kernel variant that adds the rounding offset and shifts right
306
0
  {
307
0
    DeQuantImplSIMD<vext, T, true, true>( width, maxX, maxY, scaleQP, piDequantCoef, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum );
308
0
  }
309
0
  else
310
0
  {
311
0
    DeQuantImplSIMD<vext, T, true, false>( width, maxX, maxY, scaleQP, piDequantCoef, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum );
312
0
  }
313
0
}
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)1, short>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)1, int>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)4, short>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)4, int>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int)
314
315
// Points the four dequantization members of Quant at the SIMD dispatchers
// instantiated for the requested vector extension level `vext`.
//
// Each dispatcher is bound once per coefficient type: TCoeffSig for the
// regular members and TCoeff for the *PCM members. Going by the instantiation
// names listed in this report, these are presumably `short` and `int`.
template<X86_VEXT vext>
316
void Quant::_initQuantX86()
317
59.8k
{
318
59.8k
  DeQuant           = DeQuantCoreSIMD<vext, TCoeffSig>;          // no scaling list
319
59.8k
  DeQuantPCM        = DeQuantCoreSIMD<vext, TCoeff>;             // no scaling list, TCoeff input
320
59.8k
  DeQuantScaling    = DeQuantScalingCoreSIMD<vext, TCoeffSig>;   // with scaling list
321
59.8k
  DeQuantScalingPCM = DeQuantScalingCoreSIMD<vext, TCoeff>;      // with scaling list, TCoeff input
322
59.8k
}
Unexecuted instantiation: void vvdec::Quant::_initQuantX86<(vvdec::x86_simd::X86_VEXT)1>()
void vvdec::Quant::_initQuantX86<(vvdec::x86_simd::X86_VEXT)4>()
Line
Count
Source
317
59.8k
{
318
59.8k
  DeQuant           = DeQuantCoreSIMD<vext, TCoeffSig>;
319
59.8k
  DeQuantPCM        = DeQuantCoreSIMD<vext, TCoeff>;
320
59.8k
  DeQuantScaling    = DeQuantScalingCoreSIMD<vext, TCoeffSig>;
321
59.8k
  DeQuantScalingPCM = DeQuantScalingCoreSIMD<vext, TCoeff>;
322
59.8k
}
323
// Explicit instantiation for the SIMD level named by SIMDX86.
// NOTE(review): SIMDX86 is presumably set per translation unit; this report
// lists instantiations from both Quant_sse41.cpp and Quant_avx2.cpp -- confirm.
template void Quant::_initQuantX86<SIMDX86>();
324
325
#endif  // TARGET_SIMD_X86
326
#endif  // ENABLE_SIMD_OPT_QUANT
327
328
}   // namespace vvdec