/src/vvdec/source/Lib/CommonLib/x86/QuantX86.h
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | |
43 | | /** \file QuantX86.h |
44 | | \brief SIMD for Quant/Dequant |
45 | | */ |
46 | | |
47 | | #include "CommonLib/CommonDef.h" |
48 | | #include "CommonDefX86.h" |
49 | | #include "CommonLib/Quant.h" |
50 | | |
51 | | namespace vvdec |
52 | | { |
53 | | |
54 | | #if ENABLE_SIMD_OPT_QUANT |
55 | | #ifdef TARGET_SIMD_X86 |
56 | | |
57 | | template<X86_VEXT vext, class T, bool UseScalingList, bool RightShiftPositive> |
58 | | static inline void DeQuantImplSIMD( const SizeType width, |
59 | | const int maxX, |
60 | | const int maxY, |
61 | | const int scaleQP, |
62 | | const int* piDequantCoef, // unused if UseScalingList == false |
63 | | const T* const piQCoef, |
64 | | const size_t piQCfStride, |
65 | | TCoeff* const piCoef, |
66 | | const int rightShift, |
67 | | const int inputMaximum, |
68 | | const TCoeff transformMaximum ) |
69 | 59.8k | { |
70 | 59.8k | static_assert( sizeof( piQCoef[0] ) == sizeof( int16_t ) || sizeof( piQCoef[0] ) == sizeof( int32_t ), "wrong coeff type" ); |
71 | | |
72 | 59.8k | constexpr static bool QCoef_16bit = sizeof( piQCoef[0] ) == sizeof( int16_t ); |
73 | | |
74 | 59.8k | const int inputMinimum = -( inputMaximum + 1 ); |
75 | 59.8k | const TCoeff transformMinimum = -( transformMaximum + 1 ); |
76 | | |
77 | 59.8k | const int iAdd = RightShiftPositive ? 1 << ( rightShift - 1 ) : 0; |
78 | 59.8k | const int shift = RightShiftPositive ? rightShift : -rightShift; |
79 | | |
80 | 59.8k | const __m128i vInputMin = _mm_set1_epi32( inputMinimum ); |
81 | 59.8k | const __m128i vInputMax = _mm_set1_epi32( inputMaximum ); |
82 | 59.8k | const __m128i vTransformMin = _mm_set1_epi32( transformMinimum ); |
83 | 59.8k | const __m128i vTransformMax = _mm_set1_epi32( transformMaximum ); |
84 | | |
85 | 59.8k | const __m128i vAdd = _mm_set1_epi32( iAdd ); |
86 | 59.8k | const __m128i vShift = _mm_set_epi64x( 0, shift ); |
87 | 59.8k | __m128i vScale = _mm_set1_epi32( scaleQP ); |
88 | | |
89 | | #if USE_AVX2 |
90 | | const __m256i xvInputMin = _mm256_set1_epi32( inputMinimum ); |
91 | | const __m256i xvInputMax = _mm256_set1_epi32( inputMaximum ); |
92 | | const __m256i xvTransformMin = _mm256_set1_epi32( transformMinimum ); |
93 | | const __m256i xvTransformMax = _mm256_set1_epi32( transformMaximum ); |
94 | | |
95 | | const __m256i xvAdd = _mm256_set1_epi32( iAdd ); |
96 | | __m256i xvScale = _mm256_set1_epi32( scaleQP ); |
97 | | #endif // USE_AVX2 |
98 | | |
99 | 59.8k | const int endX = maxX + 1; |
100 | 59.8k | const int maskCoeffs = endX & 3; // number of coefficients in the last vector read |
101 | | // clang-format off |
102 | 59.8k | const __m128i vMask = maskCoeffs == 3 ? _mm_set_epi32( 0, -1, -1, -1 ) : |
103 | 59.8k | ( maskCoeffs == 2 ? _mm_set_epi32( 0, 0, -1, -1 ) |
104 | 59.7k | : _mm_set_epi32( 0, 0, 0, -1 ) ); |
105 | | // clang-format on |
106 | | |
107 | 425k | for( int y = 0; y <= maxY; y++ ) |
108 | 365k | { |
109 | 365k | int x = 0; |
110 | 365k | int n = y * width; |
111 | | |
112 | | #if USE_AVX2 |
113 | 526k | for( ; x + 7 < endX; x += 8, n += 8 ) |
114 | 160k | { |
115 | 160k | if( UseScalingList ) |
116 | 0 | { |
117 | 0 | xvScale = _mm256_set1_epi32( scaleQP ); |
118 | 0 | xvScale = _mm256_mullo_epi32( xvScale, _mm256_loadu_si256( (__m256i*) &piDequantCoef[n] ) ); |
119 | 0 | } |
120 | | |
121 | 160k | __m256i xvLevel = QCoef_16bit ? _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs |
122 | 32.3k | : _mm256_loadu_si256( (__m256i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs |
123 | | |
124 | | xvLevel = _mm256_max_epi32( xvLevel, xvInputMin ); |
125 | | xvLevel = _mm256_min_epi32( xvLevel, xvInputMax ); |
126 | | |
127 | | xvLevel = _mm256_mullo_epi32( xvLevel, xvScale ); |
128 | 160k | if( RightShiftPositive ) |
129 | 63.9k | { |
130 | 63.9k | xvLevel = _mm256_add_epi32( xvLevel, xvAdd ); |
131 | 63.9k | xvLevel = _mm256_sra_epi32( xvLevel, vShift ); |
132 | 63.9k | } |
133 | 96.8k | else |
134 | 96.8k | { |
135 | 96.8k | xvLevel = _mm256_sll_epi32( xvLevel, vShift ); |
136 | 96.8k | } |
137 | | |
138 | | xvLevel = _mm256_max_epi32( xvLevel, xvTransformMin ); |
139 | | xvLevel = _mm256_min_epi32( xvLevel, xvTransformMax ); |
140 | | |
141 | 160k | _mm256_storeu_si256( (__m256i*) &piCoef[n], xvLevel ); |
142 | 160k | } |
143 | | #endif // USE_AVX2 |
144 | | |
145 | 624k | for( ; x + 3 < endX; x += 4, n += 4 ) |
146 | 258k | { |
147 | 258k | if( UseScalingList ) |
148 | 0 | { |
149 | 0 | vScale = _mm_set1_epi32( scaleQP ); |
150 | 0 | vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) ); |
151 | 0 | } |
152 | | |
153 | 258k | __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs |
154 | 258k | : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs |
155 | | |
156 | 258k | vLevel = _mm_max_epi32( vLevel, vInputMin ); |
157 | 258k | vLevel = _mm_min_epi32( vLevel, vInputMax ); |
158 | | |
159 | 258k | vLevel = _mm_mullo_epi32( vLevel, vScale ); |
160 | 258k | if( RightShiftPositive ) |
161 | 27.0k | { |
162 | 27.0k | vLevel = _mm_add_epi32( vLevel, vAdd ); |
163 | 27.0k | vLevel = _mm_sra_epi32( vLevel, vShift ); |
164 | 27.0k | } |
165 | 231k | else |
166 | 231k | { |
167 | 231k | vLevel = _mm_sll_epi32( vLevel, vShift ); |
168 | 231k | } |
169 | | |
170 | 258k | vLevel = _mm_max_epi32( vLevel, vTransformMin ); |
171 | 258k | vLevel = _mm_min_epi32( vLevel, vTransformMax ); |
172 | | |
173 | 258k | _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel ); |
174 | 258k | } |
175 | | |
176 | | #if 0 // dequant remaining coefficients using scalar code |
177 | | (void)vMask; |
178 | | for( ; x < endX; x++, n++ ) |
179 | | { |
180 | | const TCoeff level = piQCoef[x + y * piQCfStride]; |
181 | | if( !level ) |
182 | | { |
183 | | continue; |
184 | | } |
185 | | |
186 | | const int scale = UseScalingList ? piDequantCoef[n] * scaleQP // |
187 | | : scaleQP; |
188 | | const TCoeff clipQCoef = TCoeff( Clip3<Intermediate_Int>( inputMinimum, inputMaximum, level ) ); |
189 | | Intermediate_Int iCoeffQ = RightShiftPositive ? ( Intermediate_Int( clipQCoef ) * scale + iAdd ) >> rightShift // |
190 | | : ( Intermediate_Int( clipQCoef ) * scale ) * ( 1 << shift ); |
191 | | |
192 | | piCoef[n] = TCoeff( Clip3<Intermediate_Int>( transformMinimum, transformMaximum, iCoeffQ ) ); |
193 | | } |
194 | | |
195 | | #else // dequant remaining coefficients using SSE |
196 | | |
197 | 365k | if( x < endX ) |
198 | 1.80k | { |
199 | 1.80k | CHECKD( endX - x >= 4 || endX - x != maskCoeffs, "wrong mask for remaining coeffs" << ( endX - x ) << " " << maskCoeffs ); |
200 | | |
201 | 1.80k | if( UseScalingList ) |
202 | 0 | { |
203 | 0 | vScale = _mm_set1_epi32( scaleQP ); |
204 | 0 | vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) ); |
205 | 0 | vScale = _mm_and_si128( vScale, vMask ); |
206 | 0 | } |
207 | | |
208 | 1.80k | __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs |
209 | 1.80k | : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs |
210 | | |
211 | 1.80k | vLevel = _mm_and_si128( vLevel, vMask ); |
212 | | |
213 | 1.80k | vLevel = _mm_max_epi32( vLevel, vInputMin ); |
214 | 1.80k | vLevel = _mm_min_epi32( vLevel, vInputMax ); |
215 | | |
216 | 1.80k | vLevel = _mm_mullo_epi32( vLevel, vScale ); |
217 | 1.80k | if( RightShiftPositive ) |
218 | 1.71k | { |
219 | 1.71k | vLevel = _mm_add_epi32( vLevel, vAdd ); |
220 | 1.71k | vLevel = _mm_sra_epi32( vLevel, vShift ); |
221 | 1.71k | } |
222 | 90 | else |
223 | 90 | { |
224 | 90 | vLevel = _mm_sll_epi32( vLevel, vShift ); |
225 | 90 | } |
226 | | |
227 | 1.80k | vLevel = _mm_max_epi32( vLevel, vTransformMin ); |
228 | 1.80k | vLevel = _mm_min_epi32( vLevel, vTransformMax ); |
229 | | |
230 | 1.80k | if( maskCoeffs <= 2 ) |
231 | 664 | { |
232 | 664 | _mm_storeu_si64( (__m128i*) &piCoef[n], vLevel ); |
233 | 664 | } |
234 | 1.14k | else |
235 | 1.14k | { |
236 | 1.14k | _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel ); |
237 | 1.14k | } |
238 | 1.80k | } |
239 | 365k | #endif |
240 | 365k | } |
241 | 59.8k | } Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, short, false, true>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, short, false, false>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, int, false, true>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, int, false, false>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, short, true, true>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, short, true, false>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, int, true, true>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)1, int, true, false>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int) Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, short, false, true>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int) Line | Count | Source | 69 | 5.97k | { | 70 | 5.97k | static_assert( sizeof( piQCoef[0] ) == sizeof( int16_t ) || sizeof( piQCoef[0] ) == sizeof( int32_t ), "wrong coeff type" ); | 71 | | | 72 | 5.97k | constexpr static bool QCoef_16bit = sizeof( piQCoef[0] ) == sizeof( int16_t ); | 73 | | | 74 | 5.97k | const int inputMinimum = -( inputMaximum + 1 ); | 75 | 5.97k | const TCoeff transformMinimum = -( transformMaximum + 1 ); | 76 | | | 77 | 5.97k | const int iAdd = RightShiftPositive ? 1 << ( rightShift - 1 ) : 0; | 78 | 5.97k | const int shift = RightShiftPositive ? rightShift : -rightShift; | 79 | | | 80 | 5.97k | const __m128i vInputMin = _mm_set1_epi32( inputMinimum ); | 81 | 5.97k | const __m128i vInputMax = _mm_set1_epi32( inputMaximum ); | 82 | 5.97k | const __m128i vTransformMin = _mm_set1_epi32( transformMinimum ); | 83 | 5.97k | const __m128i vTransformMax = _mm_set1_epi32( transformMaximum ); | 84 | | | 85 | 5.97k | const __m128i vAdd = _mm_set1_epi32( iAdd ); | 86 | 5.97k | const __m128i vShift = _mm_set_epi64x( 0, shift ); | 87 | 5.97k | __m128i vScale = _mm_set1_epi32( scaleQP ); | 88 | | | 89 | 5.97k | #if USE_AVX2 | 90 | 5.97k | const __m256i xvInputMin = _mm256_set1_epi32( inputMinimum ); | 91 | 5.97k | const __m256i xvInputMax = _mm256_set1_epi32( inputMaximum ); | 92 | 5.97k | const __m256i xvTransformMin = _mm256_set1_epi32( transformMinimum ); | 93 | 5.97k | const __m256i xvTransformMax = _mm256_set1_epi32( transformMaximum ); | 94 | | | 95 | 5.97k | const __m256i xvAdd = _mm256_set1_epi32( iAdd ); | 96 | 5.97k | __m256i xvScale = _mm256_set1_epi32( scaleQP ); | 97 | 5.97k | #endif // USE_AVX2 | 98 | | | 99 | 5.97k | const int endX = maxX + 1; | 100 | 5.97k | const int maskCoeffs = endX & 3; // number of coefficients in the last vector read | 101 | | // clang-format off | 102 | 5.97k | const __m128i vMask = maskCoeffs == 3 ? _mm_set_epi32( 0, -1, -1, -1 ) : | 103 | 5.97k | ( maskCoeffs == 2 ? _mm_set_epi32( 0, 0, -1, -1 ) | 104 | 5.82k | : _mm_set_epi32( 0, 0, 0, -1 ) ); | 105 | | // clang-format on | 106 | | | 107 | 46.6k | for( int y = 0; y <= maxY; y++ ) | 108 | 40.6k | { | 109 | 40.6k | int x = 0; | 110 | 40.6k | int n = y * width; | 111 | | | 112 | 40.6k | #if USE_AVX2 | 113 | 75.1k | for( ; x + 7 < endX; x += 8, n += 8 ) | 114 | 34.5k | { | 115 | 34.5k | if( UseScalingList ) | 116 | 0 | { | 117 | 0 | xvScale = _mm256_set1_epi32( scaleQP ); | 118 | 0 | xvScale = _mm256_mullo_epi32( xvScale, _mm256_loadu_si256( (__m256i*) &piDequantCoef[n] ) ); | 119 | 0 | } | 120 | | | 121 | 34.5k | __m256i xvLevel = QCoef_16bit ? _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 122 | 34.5k | : _mm256_loadu_si256( (__m256i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 123 | | | 124 | 34.5k | xvLevel = _mm256_max_epi32( xvLevel, xvInputMin ); | 125 | 34.5k | xvLevel = _mm256_min_epi32( xvLevel, xvInputMax ); | 126 | | | 127 | 34.5k | xvLevel = _mm256_mullo_epi32( xvLevel, xvScale ); | 128 | 34.5k | if( RightShiftPositive ) | 129 | 34.5k | { | 130 | 34.5k | xvLevel = _mm256_add_epi32( xvLevel, xvAdd ); | 131 | 34.5k | xvLevel = _mm256_sra_epi32( xvLevel, vShift ); | 132 | 34.5k | } | 133 | 0 | else | 134 | 0 | { | 135 | 0 | xvLevel = _mm256_sll_epi32( xvLevel, vShift ); | 136 | 0 | } | 137 | | | 138 | 34.5k | xvLevel = _mm256_max_epi32( xvLevel, xvTransformMin ); | 139 | 34.5k | xvLevel = _mm256_min_epi32( xvLevel, xvTransformMax ); | 140 | | | 141 | 34.5k | _mm256_storeu_si256( (__m256i*) &piCoef[n], xvLevel ); | 142 | 34.5k | } | 143 | 40.6k | #endif // USE_AVX2 | 144 | | | 145 | 64.0k | for( ; x + 3 < endX; x += 4, n += 4 ) | 146 | 23.4k | { | 147 | 23.4k | if( UseScalingList ) | 148 | 0 | { | 149 | 0 | vScale = _mm_set1_epi32( scaleQP ); | 150 | 0 | vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) ); | 151 | 0 | } | 152 | | | 153 | 23.4k | __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 154 | 23.4k | : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 155 | | | 156 | 23.4k | vLevel = _mm_max_epi32( vLevel, vInputMin ); | 157 | 23.4k | vLevel = _mm_min_epi32( vLevel, vInputMax ); | 158 | | | 159 | 23.4k | vLevel = _mm_mullo_epi32( vLevel, vScale ); | 160 | 23.4k | if( RightShiftPositive ) | 161 | 23.4k | { | 162 | 23.4k | vLevel = _mm_add_epi32( vLevel, vAdd ); | 163 | 23.4k | vLevel = _mm_sra_epi32( vLevel, vShift ); | 164 | 23.4k | } | 165 | 0 | else | 166 | 0 | { | 167 | 0 | vLevel = _mm_sll_epi32( vLevel, vShift ); | 168 | 0 | } | 169 | | | 170 | 23.4k | vLevel = _mm_max_epi32( vLevel, vTransformMin ); | 171 | 23.4k | vLevel = _mm_min_epi32( vLevel, vTransformMax ); | 172 | | | 173 | 23.4k | _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel ); | 174 | 23.4k | } | 175 | | | 176 | | #if 0 // dequant remaining coefficients using scalar code | 177 | | (void)vMask; | 178 | | for( ; x < endX; x++, n++ ) | 179 | | { | 180 | | const TCoeff level = piQCoef[x + y * piQCfStride]; | 181 | | if( !level ) | 182 | | { | 183 | | continue; | 184 | | } | 185 | | | 186 | | const int scale = UseScalingList ? piDequantCoef[n] * scaleQP // | 187 | | : scaleQP; | 188 | | const TCoeff clipQCoef = TCoeff( Clip3<Intermediate_Int>( inputMinimum, inputMaximum, level ) ); | 189 | | Intermediate_Int iCoeffQ = RightShiftPositive ? ( Intermediate_Int( clipQCoef ) * scale + iAdd ) >> rightShift // | 190 | | : ( Intermediate_Int( clipQCoef ) * scale ) * ( 1 << shift ); | 191 | | | 192 | | piCoef[n] = TCoeff( Clip3<Intermediate_Int>( transformMinimum, transformMaximum, iCoeffQ ) ); | 193 | | } | 194 | | | 195 | | #else // dequant remaining coefficients using SSE | 196 | | | 197 | 40.6k | if( x < endX ) | 198 | 1.71k | { | 199 | 1.71k | CHECKD( endX - x >= 4 || endX - x != maskCoeffs, "wrong mask for remaining coeffs" << ( endX - x ) << " " << maskCoeffs ); | 200 | | | 201 | 1.71k | if( UseScalingList ) | 202 | 0 | { | 203 | 0 | vScale = _mm_set1_epi32( scaleQP ); | 204 | 0 | vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) ); | 205 | 0 | vScale = _mm_and_si128( vScale, vMask ); | 206 | 0 | } | 207 | | | 208 | 1.71k | __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 209 | 1.71k | : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 210 | | | 211 | 1.71k | vLevel = _mm_and_si128( vLevel, vMask ); | 212 | | | 213 | 1.71k | vLevel = _mm_max_epi32( vLevel, vInputMin ); | 214 | 1.71k | vLevel = _mm_min_epi32( vLevel, vInputMax ); | 215 | | | 216 | 1.71k | vLevel = _mm_mullo_epi32( vLevel, vScale ); | 217 | 1.71k | if( RightShiftPositive ) | 218 | 1.71k | { | 219 | 1.71k | vLevel = _mm_add_epi32( vLevel, vAdd ); | 220 | 1.71k | vLevel = _mm_sra_epi32( vLevel, vShift ); | 221 | 1.71k | } | 222 | 0 | else | 223 | 0 | { | 224 | 0 | vLevel = _mm_sll_epi32( vLevel, vShift ); | 225 | 0 | } | 226 | | | 227 | 1.71k | vLevel = _mm_max_epi32( vLevel, vTransformMin ); | 228 | 1.71k | vLevel = _mm_min_epi32( vLevel, vTransformMax ); | 229 | | | 230 | 1.71k | if( maskCoeffs <= 2 ) | 231 | 627 | { | 232 | 627 | _mm_storeu_si64( (__m128i*) &piCoef[n], vLevel ); | 233 | 627 | } | 234 | 1.09k | else | 235 | 1.09k | { | 236 | 1.09k | _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel ); | 237 | 1.09k | } | 238 | 1.71k | } | 239 | 40.6k | #endif | 240 | 40.6k | } | 241 | 5.97k | } |
Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, short, false, false>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int) Line | Count | Source | 69 | 51.6k | { | 70 | 51.6k | static_assert( sizeof( piQCoef[0] ) == sizeof( int16_t ) || sizeof( piQCoef[0] ) == sizeof( int32_t ), "wrong coeff type" ); | 71 | | | 72 | 51.6k | constexpr static bool QCoef_16bit = sizeof( piQCoef[0] ) == sizeof( int16_t ); | 73 | | | 74 | 51.6k | const int inputMinimum = -( inputMaximum + 1 ); | 75 | 51.6k | const TCoeff transformMinimum = -( transformMaximum + 1 ); | 76 | | | 77 | 51.6k | const int iAdd = RightShiftPositive ? 1 << ( rightShift - 1 ) : 0; | 78 | 51.6k | const int shift = RightShiftPositive ? rightShift : -rightShift; | 79 | | | 80 | 51.6k | const __m128i vInputMin = _mm_set1_epi32( inputMinimum ); | 81 | 51.6k | const __m128i vInputMax = _mm_set1_epi32( inputMaximum ); | 82 | 51.6k | const __m128i vTransformMin = _mm_set1_epi32( transformMinimum ); | 83 | 51.6k | const __m128i vTransformMax = _mm_set1_epi32( transformMaximum ); | 84 | | | 85 | 51.6k | const __m128i vAdd = _mm_set1_epi32( iAdd ); | 86 | 51.6k | const __m128i vShift = _mm_set_epi64x( 0, shift ); | 87 | 51.6k | __m128i vScale = _mm_set1_epi32( scaleQP ); | 88 | | | 89 | 51.6k | #if USE_AVX2 | 90 | 51.6k | const __m256i xvInputMin = _mm256_set1_epi32( inputMinimum ); | 91 | 51.6k | const __m256i xvInputMax = _mm256_set1_epi32( inputMaximum ); | 92 | 51.6k | const __m256i xvTransformMin = _mm256_set1_epi32( transformMinimum ); | 93 | 51.6k | const __m256i xvTransformMax = _mm256_set1_epi32( transformMaximum ); | 94 | | | 95 | 51.6k | const __m256i xvAdd = _mm256_set1_epi32( iAdd ); | 96 | 51.6k | __m256i xvScale = _mm256_set1_epi32( scaleQP ); | 97 | 51.6k | #endif // USE_AVX2 | 98 | | | 99 | 51.6k | const int endX = maxX + 1; | 100 | 51.6k | const int maskCoeffs = endX & 3; // number of coefficients in the last vector read | 101 | | // clang-format off | 102 | 51.6k | const __m128i vMask = maskCoeffs == 3 ? _mm_set_epi32( 0, -1, -1, -1 ) : | 103 | 51.6k | ( maskCoeffs == 2 ? _mm_set_epi32( 0, 0, -1, -1 ) | 104 | 51.5k | : _mm_set_epi32( 0, 0, 0, -1 ) ); | 105 | | // clang-format on | 106 | | | 107 | 352k | for( int y = 0; y <= maxY; y++ ) | 108 | 300k | { | 109 | 300k | int x = 0; | 110 | 300k | int n = y * width; | 111 | | | 112 | 300k | #if USE_AVX2 | 113 | 394k | for( ; x + 7 < endX; x += 8, n += 8 ) | 114 | 93.9k | { | 115 | 93.9k | if( UseScalingList ) | 116 | 0 | { | 117 | 0 | xvScale = _mm256_set1_epi32( scaleQP ); | 118 | 0 | xvScale = _mm256_mullo_epi32( xvScale, _mm256_loadu_si256( (__m256i*) &piDequantCoef[n] ) ); | 119 | 0 | } | 120 | | | 121 | 93.9k | __m256i xvLevel = QCoef_16bit ? _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 122 | 93.9k | : _mm256_loadu_si256( (__m256i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 123 | | | 124 | 93.9k | xvLevel = _mm256_max_epi32( xvLevel, xvInputMin ); | 125 | 93.9k | xvLevel = _mm256_min_epi32( xvLevel, xvInputMax ); | 126 | | | 127 | 93.9k | xvLevel = _mm256_mullo_epi32( xvLevel, xvScale ); | 128 | 93.9k | if( RightShiftPositive ) | 129 | 0 | { | 130 | 0 | xvLevel = _mm256_add_epi32( xvLevel, xvAdd ); | 131 | 0 | xvLevel = _mm256_sra_epi32( xvLevel, vShift ); | 132 | 0 | } | 133 | 93.9k | else | 134 | 93.9k | { | 135 | 93.9k | xvLevel = _mm256_sll_epi32( xvLevel, vShift ); | 136 | 93.9k | } | 137 | | | 138 | 93.9k | xvLevel = _mm256_max_epi32( xvLevel, xvTransformMin ); | 139 | 93.9k | xvLevel = _mm256_min_epi32( xvLevel, xvTransformMax ); | 140 | | | 141 | 93.9k | _mm256_storeu_si256( (__m256i*) &piCoef[n], xvLevel ); | 142 | 93.9k | } | 143 | 300k | #endif // USE_AVX2 | 144 | | | 145 | 531k | for( ; x + 3 < endX; x += 4, n += 4 ) | 146 | 231k | { | 147 | 231k | if( UseScalingList ) | 148 | 0 | { | 149 | 0 | vScale = _mm_set1_epi32( scaleQP ); | 150 | 0 | vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) ); | 151 | 0 | } | 152 | | | 153 | 231k | __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 154 | 231k | : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 155 | | | 156 | 231k | vLevel = _mm_max_epi32( vLevel, vInputMin ); | 157 | 231k | vLevel = _mm_min_epi32( vLevel, vInputMax ); | 158 | | | 159 | 231k | vLevel = _mm_mullo_epi32( vLevel, vScale ); | 160 | 231k | if( RightShiftPositive ) | 161 | 0 | { | 162 | 0 | vLevel = _mm_add_epi32( vLevel, vAdd ); | 163 | 0 | vLevel = _mm_sra_epi32( vLevel, vShift ); | 164 | 0 | } | 165 | 231k | else | 166 | 231k | { | 167 | 231k | vLevel = _mm_sll_epi32( vLevel, vShift ); | 168 | 231k | } | 169 | | | 170 | 231k | vLevel = _mm_max_epi32( vLevel, vTransformMin ); | 171 | 231k | vLevel = _mm_min_epi32( vLevel, vTransformMax ); | 172 | | | 173 | 231k | _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel ); | 174 | 231k | } | 175 | | | 176 | | #if 0 // dequant remaining coefficients using scalar code | 177 | | (void)vMask; | 178 | | for( ; x < endX; x++, n++ ) | 179 | | { | 180 | | const TCoeff level = piQCoef[x + y * piQCfStride]; | 181 | | if( !level ) | 182 | | { | 183 | | continue; | 184 | | } | 185 | | | 186 | | const int scale = UseScalingList ? piDequantCoef[n] * scaleQP // | 187 | | : scaleQP; | 188 | | const TCoeff clipQCoef = TCoeff( Clip3<Intermediate_Int>( inputMinimum, inputMaximum, level ) ); | 189 | | Intermediate_Int iCoeffQ = RightShiftPositive ? ( Intermediate_Int( clipQCoef ) * scale + iAdd ) >> rightShift // | 190 | | : ( Intermediate_Int( clipQCoef ) * scale ) * ( 1 << shift ); | 191 | | | 192 | | piCoef[n] = TCoeff( Clip3<Intermediate_Int>( transformMinimum, transformMaximum, iCoeffQ ) ); | 193 | | } | 194 | | | 195 | | #else // dequant remaining coefficients using SSE | 196 | | | 197 | 300k | if( x < endX ) | 198 | 90 | { | 199 | 90 | CHECKD( endX - x >= 4 || endX - x != maskCoeffs, "wrong mask for remaining coeffs" << ( endX - x ) << " " << maskCoeffs ); | 200 | | | 201 | 90 | if( UseScalingList ) | 202 | 0 | { | 203 | 0 | vScale = _mm_set1_epi32( scaleQP ); | 204 | 0 | vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) ); | 205 | 0 | vScale = _mm_and_si128( vScale, vMask ); | 206 | 0 | } | 207 | | | 208 | 90 | __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 209 | 90 | : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 210 | | | 211 | 90 | vLevel = _mm_and_si128( vLevel, vMask ); | 212 | | | 213 | 90 | vLevel = _mm_max_epi32( vLevel, vInputMin ); | 214 | 90 | vLevel = _mm_min_epi32( vLevel, vInputMax ); | 215 | | | 216 | 90 | vLevel = _mm_mullo_epi32( vLevel, vScale ); | 217 | 90 | if( RightShiftPositive ) | 218 | 0 | { | 219 | 0 | vLevel = _mm_add_epi32( vLevel, vAdd ); | 220 | 0 | vLevel = _mm_sra_epi32( vLevel, vShift ); | 221 | 0 | } | 222 | 90 | else | 223 | 90 | { | 224 | 90 | vLevel = _mm_sll_epi32( vLevel, vShift ); | 225 | 90 | } | 226 | | | 227 | 90 | vLevel = _mm_max_epi32( vLevel, vTransformMin ); | 228 | 90 | vLevel = _mm_min_epi32( vLevel, vTransformMax ); | 229 | | | 230 | 90 | if( maskCoeffs <= 2 ) | 231 | 37 | { | 232 | 37 | _mm_storeu_si64( (__m128i*) &piCoef[n], vLevel ); | 233 | 37 | } | 234 | 53 | else | 235 | 53 | { | 236 | 53 | _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel ); | 237 | 53 | } | 238 | 90 | } | 239 | 300k | #endif | 240 | 300k | } | 241 | 51.6k | } |
Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, int, false, true>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int) Line | Count | Source | 69 | 2.10k | { | 70 | 2.10k | static_assert( sizeof( piQCoef[0] ) == sizeof( int16_t ) || sizeof( piQCoef[0] ) == sizeof( int32_t ), "wrong coeff type" ); | 71 | | | 72 | 2.10k | constexpr static bool QCoef_16bit = sizeof( piQCoef[0] ) == sizeof( int16_t ); | 73 | | | 74 | 2.10k | const int inputMinimum = -( inputMaximum + 1 ); | 75 | 2.10k | const TCoeff transformMinimum = -( transformMaximum + 1 ); | 76 | | | 77 | 2.10k | const int iAdd = RightShiftPositive ? 1 << ( rightShift - 1 ) : 0; | 78 | 2.10k | const int shift = RightShiftPositive ? rightShift : -rightShift; | 79 | | | 80 | 2.10k | const __m128i vInputMin = _mm_set1_epi32( inputMinimum ); | 81 | 2.10k | const __m128i vInputMax = _mm_set1_epi32( inputMaximum ); | 82 | 2.10k | const __m128i vTransformMin = _mm_set1_epi32( transformMinimum ); | 83 | 2.10k | const __m128i vTransformMax = _mm_set1_epi32( transformMaximum ); | 84 | | | 85 | 2.10k | const __m128i vAdd = _mm_set1_epi32( iAdd ); | 86 | 2.10k | const __m128i vShift = _mm_set_epi64x( 0, shift ); | 87 | 2.10k | __m128i vScale = _mm_set1_epi32( scaleQP ); | 88 | | | 89 | 2.10k | #if USE_AVX2 | 90 | 2.10k | const __m256i xvInputMin = _mm256_set1_epi32( inputMinimum ); | 91 | 2.10k | const __m256i xvInputMax = _mm256_set1_epi32( inputMaximum ); | 92 | 2.10k | const __m256i xvTransformMin = _mm256_set1_epi32( transformMinimum ); | 93 | 2.10k | const __m256i xvTransformMax = _mm256_set1_epi32( transformMaximum ); | 94 | | | 95 | 2.10k | const __m256i xvAdd = _mm256_set1_epi32( iAdd ); | 96 | 2.10k | __m256i xvScale = _mm256_set1_epi32( scaleQP ); | 97 | 2.10k | #endif // USE_AVX2 | 98 | | | 99 | 2.10k | const int endX = maxX + 1; | 100 | 2.10k | const int maskCoeffs = endX & 3; // number of coefficients in the last vector read | 101 | | // clang-format off | 102 | 2.10k | const __m128i vMask = maskCoeffs == 3 ? _mm_set_epi32( 0, -1, -1, -1 ) : | 103 | 2.10k | ( maskCoeffs == 2 ? _mm_set_epi32( 0, 0, -1, -1 ) | 104 | 2.10k | : _mm_set_epi32( 0, 0, 0, -1 ) ); | 105 | | // clang-format on | 106 | | | 107 | 24.1k | for( int y = 0; y <= maxY; y++ ) | 108 | 22.0k | { | 109 | 22.0k | int x = 0; | 110 | 22.0k | int n = y * width; | 111 | | | 112 | 22.0k | #if USE_AVX2 | 113 | 51.4k | for( ; x + 7 < endX; x += 8, n += 8 ) | 114 | 29.4k | { | 115 | 29.4k | if( UseScalingList ) | 116 | 0 | { | 117 | 0 | xvScale = _mm256_set1_epi32( scaleQP ); | 118 | 0 | xvScale = _mm256_mullo_epi32( xvScale, _mm256_loadu_si256( (__m256i*) &piDequantCoef[n] ) ); | 119 | 0 | } | 120 | | | 121 | 29.4k | __m256i xvLevel = QCoef_16bit ? _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 122 | 29.4k | : _mm256_loadu_si256( (__m256i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 123 | | | 124 | 29.4k | xvLevel = _mm256_max_epi32( xvLevel, xvInputMin ); | 125 | 29.4k | xvLevel = _mm256_min_epi32( xvLevel, xvInputMax ); | 126 | | | 127 | 29.4k | xvLevel = _mm256_mullo_epi32( xvLevel, xvScale ); | 128 | 29.4k | if( RightShiftPositive ) | 129 | 29.4k | { | 130 | 29.4k | xvLevel = _mm256_add_epi32( xvLevel, xvAdd ); | 131 | 29.4k | xvLevel = _mm256_sra_epi32( xvLevel, vShift ); | 132 | 29.4k | } | 133 | 0 | else | 134 | 0 | { | 135 | 0 | xvLevel = _mm256_sll_epi32( xvLevel, vShift ); | 136 | 0 | } | 137 | | | 138 | 29.4k | xvLevel = _mm256_max_epi32( xvLevel, xvTransformMin ); | 139 | 29.4k | xvLevel = _mm256_min_epi32( xvLevel, xvTransformMax ); | 140 | | | 141 | 29.4k | _mm256_storeu_si256( (__m256i*) &piCoef[n], xvLevel ); | 142 | 29.4k | } | 143 | 22.0k | #endif // USE_AVX2 | 144 | | | 145 | 25.6k | for( ; x + 3 < endX; x += 4, n += 4 ) | 146 | 3.62k | { | 147 | 3.62k | if( UseScalingList ) | 148 | 0 | { | 149 | 0 | vScale = _mm_set1_epi32( scaleQP ); | 150 | 0 | vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) ); | 151 | 0 | } | 152 | | | 153 | 3.62k | __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 154 | 3.62k | : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 155 | | | 156 | 3.62k | vLevel = _mm_max_epi32( vLevel, vInputMin ); | 157 | 3.62k | vLevel = _mm_min_epi32( vLevel, vInputMax ); | 158 | | | 159 | 3.62k | vLevel = _mm_mullo_epi32( vLevel, vScale ); | 160 | 3.62k | if( RightShiftPositive ) | 161 | 3.62k | { | 162 | 3.62k | vLevel = _mm_add_epi32( vLevel, vAdd ); | 163 | 3.62k | vLevel = _mm_sra_epi32( vLevel, vShift ); | 164 | 3.62k | } | 165 | 0 | else | 166 | 0 | { | 167 | 0 | vLevel = _mm_sll_epi32( vLevel, vShift ); | 168 | 0 | } | 169 | | | 170 | 3.62k | vLevel = _mm_max_epi32( vLevel, vTransformMin ); | 171 | 3.62k | vLevel = _mm_min_epi32( vLevel, vTransformMax ); | 172 | | | 173 | 3.62k | _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel ); | 174 | 3.62k | } | 175 | | | 176 | | #if 0 // dequant remaining coefficients using scalar code | 177 | | (void)vMask; | 178 | | for( ; x < endX; x++, n++ ) | 179 | | { | 180 | | const TCoeff level = piQCoef[x + y * piQCfStride]; | 181 | | if( !level ) | 182 | | { | 183 | | continue; | 184 | | } | 185 | | | 186 | | const int scale = UseScalingList ? piDequantCoef[n] * scaleQP // | 187 | | : scaleQP; | 188 | | const TCoeff clipQCoef = TCoeff( Clip3<Intermediate_Int>( inputMinimum, inputMaximum, level ) ); | 189 | | Intermediate_Int iCoeffQ = RightShiftPositive ? ( Intermediate_Int( clipQCoef ) * scale + iAdd ) >> rightShift // | 190 | | : ( Intermediate_Int( clipQCoef ) * scale ) * ( 1 << shift ); | 191 | | | 192 | | piCoef[n] = TCoeff( Clip3<Intermediate_Int>( transformMinimum, transformMaximum, iCoeffQ ) ); | 193 | | } | 194 | | | 195 | | #else // dequant remaining coefficients using SSE | 196 | | | 197 | 22.0k | if( x < endX ) | 198 | 0 | { | 199 | 0 | CHECKD( endX - x >= 4 || endX - x != maskCoeffs, "wrong mask for remaining coeffs" << ( endX - x ) << " " << maskCoeffs ); | 200 | |
| 201 | 0 | if( UseScalingList ) | 202 | 0 | { | 203 | 0 | vScale = _mm_set1_epi32( scaleQP ); | 204 | 0 | vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) ); | 205 | 0 | vScale = _mm_and_si128( vScale, vMask ); | 206 | 0 | } | 207 | |
| 208 | 0 | __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 209 | 0 | : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 210 | |
| 211 | 0 | vLevel = _mm_and_si128( vLevel, vMask ); | 212 | |
| 213 | 0 | vLevel = _mm_max_epi32( vLevel, vInputMin ); | 214 | 0 | vLevel = _mm_min_epi32( vLevel, vInputMax ); | 215 | |
| 216 | 0 | vLevel = _mm_mullo_epi32( vLevel, vScale ); | 217 | 0 | if( RightShiftPositive ) | 218 | 0 | { | 219 | 0 | vLevel = _mm_add_epi32( vLevel, vAdd ); | 220 | 0 | vLevel = _mm_sra_epi32( vLevel, vShift ); | 221 | 0 | } | 222 | 0 | else | 223 | 0 | { | 224 | 0 | vLevel = _mm_sll_epi32( vLevel, vShift ); | 225 | 0 | } | 226 | |
| 227 | 0 | vLevel = _mm_max_epi32( vLevel, vTransformMin ); | 228 | 0 | vLevel = _mm_min_epi32( vLevel, vTransformMax ); | 229 | |
| 230 | 0 | if( maskCoeffs <= 2 ) | 231 | 0 | { | 232 | 0 | _mm_storeu_si64( (__m128i*) &piCoef[n], vLevel ); | 233 | 0 | } | 234 | 0 | else | 235 | 0 | { | 236 | 0 | _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel ); | 237 | 0 | } | 238 | 0 | } | 239 | 22.0k | #endif | 240 | 22.0k | } | 241 | 2.10k | } |
Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, int, false, false>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int) Line | Count | Source | 69 | 198 | { | 70 | 198 | static_assert( sizeof( piQCoef[0] ) == sizeof( int16_t ) || sizeof( piQCoef[0] ) == sizeof( int32_t ), "wrong coeff type" ); | 71 | | | 72 | 198 | constexpr static bool QCoef_16bit = sizeof( piQCoef[0] ) == sizeof( int16_t ); | 73 | | | 74 | 198 | const int inputMinimum = -( inputMaximum + 1 ); | 75 | 198 | const TCoeff transformMinimum = -( transformMaximum + 1 ); | 76 | | | 77 | 198 | const int iAdd = RightShiftPositive ? 1 << ( rightShift - 1 ) : 0; | 78 | 198 | const int shift = RightShiftPositive ? rightShift : -rightShift; | 79 | | | 80 | 198 | const __m128i vInputMin = _mm_set1_epi32( inputMinimum ); | 81 | 198 | const __m128i vInputMax = _mm_set1_epi32( inputMaximum ); | 82 | 198 | const __m128i vTransformMin = _mm_set1_epi32( transformMinimum ); | 83 | 198 | const __m128i vTransformMax = _mm_set1_epi32( transformMaximum ); | 84 | | | 85 | 198 | const __m128i vAdd = _mm_set1_epi32( iAdd ); | 86 | 198 | const __m128i vShift = _mm_set_epi64x( 0, shift ); | 87 | 198 | __m128i vScale = _mm_set1_epi32( scaleQP ); | 88 | | | 89 | 198 | #if USE_AVX2 | 90 | 198 | const __m256i xvInputMin = _mm256_set1_epi32( inputMinimum ); | 91 | 198 | const __m256i xvInputMax = _mm256_set1_epi32( inputMaximum ); | 92 | 198 | const __m256i xvTransformMin = _mm256_set1_epi32( transformMinimum ); | 93 | 198 | const __m256i xvTransformMax = _mm256_set1_epi32( transformMaximum ); | 94 | | | 95 | 198 | const __m256i xvAdd = _mm256_set1_epi32( iAdd ); | 96 | 198 | __m256i xvScale = _mm256_set1_epi32( scaleQP ); | 97 | 198 | #endif // USE_AVX2 | 98 | | | 99 | 198 | const int endX = maxX + 1; | 100 | 198 | const int maskCoeffs = endX & 3; // number of coefficients in the last vector read | 101 | | // clang-format off | 102 | 198 | const __m128i vMask = maskCoeffs == 3 ? _mm_set_epi32( 0, -1, -1, -1 ) : | 103 | 198 | ( maskCoeffs == 2 ? _mm_set_epi32( 0, 0, -1, -1 ) | 104 | 198 | : _mm_set_epi32( 0, 0, 0, -1 ) ); | 105 | | // clang-format on | 106 | | | 107 | 2.30k | for( int y = 0; y <= maxY; y++ ) | 108 | 2.10k | { | 109 | 2.10k | int x = 0; | 110 | 2.10k | int n = y * width; | 111 | | | 112 | 2.10k | #if USE_AVX2 | 113 | 5.00k | for( ; x + 7 < endX; x += 8, n += 8 ) | 114 | 2.90k | { | 115 | 2.90k | if( UseScalingList ) | 116 | 0 | { | 117 | 0 | xvScale = _mm256_set1_epi32( scaleQP ); | 118 | 0 | xvScale = _mm256_mullo_epi32( xvScale, _mm256_loadu_si256( (__m256i*) &piDequantCoef[n] ) ); | 119 | 0 | } | 120 | | | 121 | 2.90k | __m256i xvLevel = QCoef_16bit ? _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 122 | 2.90k | : _mm256_loadu_si256( (__m256i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 123 | | | 124 | 2.90k | xvLevel = _mm256_max_epi32( xvLevel, xvInputMin ); | 125 | 2.90k | xvLevel = _mm256_min_epi32( xvLevel, xvInputMax ); | 126 | | | 127 | 2.90k | xvLevel = _mm256_mullo_epi32( xvLevel, xvScale ); | 128 | 2.90k | if( RightShiftPositive ) | 129 | 0 | { | 130 | 0 | xvLevel = _mm256_add_epi32( xvLevel, xvAdd ); | 131 | 0 | xvLevel = _mm256_sra_epi32( xvLevel, vShift ); | 132 | 0 | } | 133 | 2.90k | else | 134 | 2.90k | { | 135 | 2.90k | xvLevel = _mm256_sll_epi32( xvLevel, vShift ); | 136 | 2.90k | } | 137 | | | 138 | 2.90k | xvLevel = _mm256_max_epi32( xvLevel, xvTransformMin ); | 139 | 2.90k | xvLevel = _mm256_min_epi32( xvLevel, xvTransformMax ); | 140 | | | 141 | 2.90k | _mm256_storeu_si256( (__m256i*) &piCoef[n], xvLevel ); | 142 | 2.90k | } | 143 | 2.10k | #endif // USE_AVX2 | 144 | | | 145 | 2.47k | for( ; x + 3 < endX; x += 4, n += 4 ) | 146 | 368 | { | 147 | 368 | if( UseScalingList ) | 148 | 0 | { | 149 | 0 | vScale = _mm_set1_epi32( scaleQP ); | 150 | 0 | vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) ); | 151 | 0 | } | 152 | | | 153 | 368 | __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 154 | 368 | : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 155 | | | 156 | 368 | vLevel = _mm_max_epi32( vLevel, vInputMin ); | 157 | 368 | vLevel = _mm_min_epi32( vLevel, vInputMax ); | 158 | | | 159 | 368 | vLevel = _mm_mullo_epi32( vLevel, vScale ); | 160 | 368 | if( RightShiftPositive ) | 161 | 0 | { | 162 | 0 | vLevel = _mm_add_epi32( vLevel, vAdd ); | 163 | 0 | vLevel = _mm_sra_epi32( vLevel, vShift ); | 164 | 0 | } | 165 | 368 | else | 166 | 368 | { | 167 | 368 | vLevel = _mm_sll_epi32( vLevel, vShift ); | 168 | 368 | } | 169 | | | 170 | 368 | vLevel = _mm_max_epi32( vLevel, vTransformMin ); | 171 | 368 | vLevel = _mm_min_epi32( vLevel, vTransformMax ); | 172 | | | 173 | 368 | _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel ); | 174 | 368 | } | 175 | | | 176 | | #if 0 // dequant remaining coefficients using scalar code | 177 | | (void)vMask; | 178 | | for( ; x < endX; x++, n++ ) | 179 | | { | 180 | | const TCoeff level = piQCoef[x + y * piQCfStride]; | 181 | | if( !level ) | 182 | | { | 183 | | continue; | 184 | | } | 185 | | | 186 | | const int scale = UseScalingList ? piDequantCoef[n] * scaleQP // | 187 | | : scaleQP; | 188 | | const TCoeff clipQCoef = TCoeff( Clip3<Intermediate_Int>( inputMinimum, inputMaximum, level ) ); | 189 | | Intermediate_Int iCoeffQ = RightShiftPositive ? ( Intermediate_Int( clipQCoef ) * scale + iAdd ) >> rightShift // | 190 | | : ( Intermediate_Int( clipQCoef ) * scale ) * ( 1 << shift ); | 191 | | | 192 | | piCoef[n] = TCoeff( Clip3<Intermediate_Int>( transformMinimum, transformMaximum, iCoeffQ ) ); | 193 | | } | 194 | | | 195 | | #else // dequant remaining coefficients using SSE | 196 | | | 197 | 2.10k | if( x < endX ) | 198 | 0 | { | 199 | 0 | CHECKD( endX - x >= 4 || endX - x != maskCoeffs, "wrong mask for remaining coeffs" << ( endX - x ) << " " << maskCoeffs ); | 200 | |
| 201 | 0 | if( UseScalingList ) | 202 | 0 | { | 203 | 0 | vScale = _mm_set1_epi32( scaleQP ); | 204 | 0 | vScale = _mm_mullo_epi32( vScale, _mm_loadu_si128( (__m128i*) &piDequantCoef[n] ) ); | 205 | 0 | vScale = _mm_and_si128( vScale, vMask ); | 206 | 0 | } | 207 | |
| 208 | 0 | __m128i vLevel = QCoef_16bit ? _mm_cvtepi16_epi32( _mm_loadu_si64( &piQCoef[x + y * piQCfStride] ) ) // 16 bit coeffs | 209 | 0 | : _mm_loadu_si128( (__m128i*) &piQCoef[x + y * piQCfStride] ); // 32 bit coeffs | 210 | |
| 211 | 0 | vLevel = _mm_and_si128( vLevel, vMask ); | 212 | |
| 213 | 0 | vLevel = _mm_max_epi32( vLevel, vInputMin ); | 214 | 0 | vLevel = _mm_min_epi32( vLevel, vInputMax ); | 215 | |
| 216 | 0 | vLevel = _mm_mullo_epi32( vLevel, vScale ); | 217 | 0 | if( RightShiftPositive ) | 218 | 0 | { | 219 | 0 | vLevel = _mm_add_epi32( vLevel, vAdd ); | 220 | 0 | vLevel = _mm_sra_epi32( vLevel, vShift ); | 221 | 0 | } | 222 | 0 | else | 223 | 0 | { | 224 | 0 | vLevel = _mm_sll_epi32( vLevel, vShift ); | 225 | 0 | } | 226 | |
| 227 | 0 | vLevel = _mm_max_epi32( vLevel, vTransformMin ); | 228 | 0 | vLevel = _mm_min_epi32( vLevel, vTransformMax ); | 229 | |
| 230 | 0 | if( maskCoeffs <= 2 ) | 231 | 0 | { | 232 | 0 | _mm_storeu_si64( (__m128i*) &piCoef[n], vLevel ); | 233 | 0 | } | 234 | 0 | else | 235 | 0 | { | 236 | 0 | _mm_storeu_si128( (__m128i*) &piCoef[n], vLevel ); | 237 | 0 | } | 238 | 0 | } | 239 | 2.10k | #endif | 240 | 2.10k | } | 241 | 198 | } |
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, short, true, true>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, short, true, false>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, int, true, true>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantImplSIMD<(vvdec::x86_simd::X86_VEXT)4, int, true, false>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int) |
242 | | |
243 | | template<X86_VEXT vext, class T> |
244 | | static void DeQuantCoreSIMD( const SizeType width, |
245 | | const int maxX, |
246 | | const int maxY, |
247 | | const int scale, |
248 | | const T* const piQCoef, |
249 | | const size_t piQCfStride, |
250 | | TCoeff* const piCoef, |
251 | | const int rightShift, |
252 | | const int inputMaximum, |
253 | | const TCoeff transformMaximum ) |
254 | 79.9k | { |
255 | 79.9k | if( maxX < 2 ) |
256 | 20.1k | { |
257 | 20.1k | Quant::DeQuantCore<T>(width, |
258 | 20.1k | maxX, |
259 | 20.1k | maxY, |
260 | 20.1k | scale, |
261 | 20.1k | piQCoef, |
262 | 20.1k | piQCfStride, |
263 | 20.1k | piCoef, |
264 | 20.1k | rightShift, |
265 | 20.1k | inputMaximum, |
266 | 20.1k | transformMaximum ); |
267 | 20.1k | } |
268 | 59.8k | else if( rightShift > 0 ) |
269 | 8.07k | { |
270 | 8.07k | DeQuantImplSIMD<vext, T, false, true>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum ); |
271 | 8.07k | } |
272 | 51.7k | else |
273 | 51.7k | { |
274 | 51.7k | DeQuantImplSIMD<vext, T, false, false>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum ); |
275 | 51.7k | } |
276 | 79.9k | } Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)1, short>(unsigned int, int, int, int, short const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)1, int>(unsigned int, int, int, int, int const*, unsigned long, int*, int, int, int) Quant_avx2.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)4, short>(unsigned int, int, int, int, short const*, unsigned long, int*, int, int, int) Line | Count | Source | 254 | 77.6k | { | 255 | 77.6k | if( maxX < 2 ) | 256 | 20.1k | { | 257 | 20.1k | Quant::DeQuantCore<T>(width, | 258 | 20.1k | maxX, | 259 | 20.1k | maxY, | 260 | 20.1k | scale, | 261 | 20.1k | piQCoef, | 262 | 20.1k | piQCfStride, | 263 | 20.1k | piCoef, | 264 | 20.1k | rightShift, | 265 | 20.1k | inputMaximum, | 266 | 20.1k | transformMaximum ); | 267 | 20.1k | } | 268 | 57.5k | else if( rightShift > 0 ) | 269 | 5.97k | { | 270 | 5.97k | DeQuantImplSIMD<vext, T, false, true>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum ); | 271 | 5.97k | } | 272 | 51.5k | else | 273 | 51.5k | { | 274 | 51.5k | DeQuantImplSIMD<vext, T, false, false>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum ); | 275 | 51.5k | } | 276 | 77.6k | } |
Quant_avx2.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)4, int>(unsigned int, int, int, int, int const*, unsigned long, int*, int, int, int) Line | Count | Source | 254 | 2.29k | { | 255 | 2.29k | if( maxX < 2 ) | 256 | 0 | { | 257 | 0 | Quant::DeQuantCore<T>(width, | 258 | 0 | maxX, | 259 | 0 | maxY, | 260 | 0 | scale, | 261 | 0 | piQCoef, | 262 | 0 | piQCfStride, | 263 | 0 | piCoef, | 264 | 0 | rightShift, | 265 | 0 | inputMaximum, | 266 | 0 | transformMaximum ); | 267 | 0 | } | 268 | 2.29k | else if( rightShift > 0 ) | 269 | 2.10k | { | 270 | 2.10k | DeQuantImplSIMD<vext, T, false, true>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum ); | 271 | 2.10k | } | 272 | 198 | else | 273 | 198 | { | 274 | 198 | DeQuantImplSIMD<vext, T, false, false>( width, maxX, maxY, scale, nullptr, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum ); | 275 | 198 | } | 276 | 2.29k | } |
|
277 | | |
278 | | template<X86_VEXT vext, class T> |
279 | | static void DeQuantScalingCoreSIMD( const SizeType width, |
280 | | const int maxX, |
281 | | const int maxY, |
282 | | const int scaleQP, |
283 | | const int* piDequantCoef, |
284 | | const T* const piQCoef, |
285 | | const size_t piQCfStride, |
286 | | TCoeff* const piCoef, |
287 | | const int rightShift, |
288 | | const int inputMaximum, |
289 | | const TCoeff transformMaximum ) |
290 | 0 | { |
291 | 0 | if( maxX < 2 ) |
292 | 0 | { |
293 | 0 | Quant::DeQuantScalingCore<T>(width, |
294 | 0 | maxX, |
295 | 0 | maxY, |
296 | 0 | scaleQP, |
297 | 0 | piDequantCoef, |
298 | 0 | piQCoef, |
299 | 0 | piQCfStride, |
300 | 0 | piCoef, |
301 | 0 | rightShift, |
302 | 0 | inputMaximum, |
303 | 0 | transformMaximum ); |
304 | 0 | } |
305 | 0 | else if( rightShift > 0 ) |
306 | 0 | { |
307 | 0 | DeQuantImplSIMD<vext, T, true, true>( width, maxX, maxY, scaleQP, piDequantCoef, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum ); |
308 | 0 | } |
309 | 0 | else |
310 | 0 | { |
311 | 0 | DeQuantImplSIMD<vext, T, true, false>( width, maxX, maxY, scaleQP, piDequantCoef, piQCoef, piQCfStride, piCoef, rightShift, inputMaximum, transformMaximum ); |
312 | 0 | } |
313 | 0 | } Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)1, short>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)1, int>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)4, short>(unsigned int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)4, int>(unsigned int, int, int, int, int const*, int const*, unsigned long, int*, int, int, int) |
314 | | |
315 | | template<X86_VEXT vext> |
316 | | void Quant::_initQuantX86() |
317 | 59.8k | { |
318 | 59.8k | DeQuant = DeQuantCoreSIMD<vext, TCoeffSig>; |
319 | 59.8k | DeQuantPCM = DeQuantCoreSIMD<vext, TCoeff>; |
320 | 59.8k | DeQuantScaling = DeQuantScalingCoreSIMD<vext, TCoeffSig>; |
321 | 59.8k | DeQuantScalingPCM = DeQuantScalingCoreSIMD<vext, TCoeff>; |
322 | 59.8k | } Unexecuted instantiation: void vvdec::Quant::_initQuantX86<(vvdec::x86_simd::X86_VEXT)1>() void vvdec::Quant::_initQuantX86<(vvdec::x86_simd::X86_VEXT)4>() Line | Count | Source | 317 | 59.8k | { | 318 | 59.8k | DeQuant = DeQuantCoreSIMD<vext, TCoeffSig>; | 319 | 59.8k | DeQuantPCM = DeQuantCoreSIMD<vext, TCoeff>; | 320 | 59.8k | DeQuantScaling = DeQuantScalingCoreSIMD<vext, TCoeffSig>; | 321 | 59.8k | DeQuantScalingPCM = DeQuantScalingCoreSIMD<vext, TCoeff>; | 322 | 59.8k | } |
|
323 | | template void Quant::_initQuantX86<SIMDX86>(); |
324 | | |
325 | | #endif // TARGET_SIMD_X86 |
326 | | #endif // ENABLE_SIMD_OPT_QUANT |
327 | | |
328 | | } // namespace vvdec |