/src/vvenc/source/Lib/CommonLib/x86/QuantX86.h
/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the Clear BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.

The Clear BSD License

Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted (subject to the limitations in the disclaimer below) provided that
the following conditions are met:

     * Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.

     * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

     * Neither the name of the copyright holder nor the names of its
     contributors may be used to endorse or promote products derived from this
     software without specific prior written permission.

NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.


------------------------------------------------------------------------------------------- */
/**
 * \file
 * \brief Implementation of quantization functions
 */
//#define USE_AVX2
// ====================================================================================================================
// Includes
// ====================================================================================================================

#pragma once

#include "CommonDefX86.h"
#include "Rom.h"
#include "QuantRDOQ2.h"

#if defined(TARGET_SIMD_X86) && ENABLE_SIMD_OPT_QUANT

//! \ingroup CommonLib
//! \{

namespace vvenc {

#define cond_mm_prefetch(a,b) _mm_prefetch(a,b)
//#define cond_mm_prefetch(a,b)

static constexpr unsigned short levmask[16] = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0,0,0,0,0,0,0,0};
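
// DeQuantCoreSIMD processes eight 16-bit quantized levels per iteration. A
// scalar sketch of what each lane computes, derived from the intrinsics below
// (for reference only, not part of the build):
//
//   int    level = Clip3<int>( inputMinimum, inputMaximum, piQCoef[x + y * piQCfStride] );
//   TCoeff coeff = rightShift > 0 ? ( level * scale + iAdd ) >> rightShift   // iAdd = 1 << (rightShift - 1)
//                                 : ( level * scale ) << -rightShift;
//   piCoef[x + y * width] = Clip3<TCoeff>( transformMinimum, transformMaximum, coeff );
//
// The 32-bit product level * scale is assembled from the 16-bit halves returned
// by _mm_mullo_epi16/_mm_mulhi_epi16; vlevmask zeroes the lanes beyond maxX.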
template<X86_VEXT vext>
static void DeQuantCoreSIMD(const int maxX,const int maxY,const int scale,const TCoeffSig *const piQCoef,const size_t piQCfStride,TCoeff *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
{
  // TODO: TCoeffSig!!!
  const int    inputMinimum     = -(inputMaximum+1);
  const TCoeff transformMinimum = -(transformMaximum+1);
  const int    width            = maxX+1;

  __m128i vlevmask;
  if (maxX<7)
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
  else
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);

  if (rightShift>0)
  {
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);

    __m128i v_max    = _mm_set1_epi16 ((short)inputMaximum);
    __m128i v_min    = _mm_set1_epi16 ((short)inputMinimum);
    __m128i v_Tmax   = _mm_set1_epi32 ((short)transformMaximum);
    __m128i v_Tmin   = _mm_set1_epi32 ((short)transformMinimum);
    __m128i v_scale  = _mm_set1_epi16 ((short)scale);
    __m128i v_add    = _mm_set1_epi32 (iAdd);
    __m128i v_rshift = _mm_set1_epi64x (rightShift);

    if (maxX<4)
    {
      for( int y = 0; y <= maxY; y++)
      {
        __m128i v_level = maxX == 1 ? _mm_set1_epi32( *( ( int const* ) &piQCoef[y * piQCfStride] ) ) : _vv_loadl_epi64( ( __m128i const* ) &piQCoef[y * piQCfStride] );
        v_level = _mm_and_si128(v_level,vlevmask);
        v_level = _mm_max_epi16 (v_level, v_min);
        v_level = _mm_min_epi16 (v_level, v_max);
        __m128i v_low  = _mm_mullo_epi16(v_level,v_scale);
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);

        v_level = _mm_unpacklo_epi16(v_low,v_high);
        v_level = _mm_add_epi32(v_level,v_add);
        v_level = _mm_sra_epi32(v_level,v_rshift);

        v_level = _mm_max_epi32 (v_level, v_Tmin);
        v_level = _mm_min_epi32 (v_level, v_Tmax);
        if( maxX == 1 )
          _vv_storel_epi64( (__m128i*)(piCoef + y * width), v_level );
        else
          _mm_storeu_si128( (__m128i*)(piCoef + y * width), v_level );
      }
    }
    else
    {
      for( int y = 0; y <= maxY; y++)
      {
        for( int x = 0; x <= maxX; x+=8)
        {
          __m128i v_level = _mm_loadu_si128( ( __m128i const* ) &piQCoef[x + y * piQCfStride] );
          v_level = _mm_and_si128(v_level,vlevmask);
          v_level = _mm_max_epi16 (v_level, v_min);
          v_level = _mm_min_epi16 (v_level, v_max);
          __m128i v_low  = _mm_mullo_epi16(v_level,v_scale);
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);

          v_level = _mm_unpacklo_epi16(v_low,v_high);
          v_level = _mm_add_epi32(v_level,v_add);
          v_level = _mm_sra_epi32(v_level,v_rshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );

          v_level = _mm_unpackhi_epi16(v_low,v_high);
          v_level = _mm_add_epi32(v_level,v_add);
          v_level = _mm_sra_epi32(v_level,v_rshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
        }
      }
    }
  }
  else // rightShift <= 0: scale up instead of rounding down
  {
    __m128i v_max    = _mm_set1_epi16 ((short)inputMaximum);
    __m128i v_min    = _mm_set1_epi16 ((short)inputMinimum);
    __m128i v_Tmax   = _mm_set1_epi32 ((short)transformMaximum);
    __m128i v_Tmin   = _mm_set1_epi32 ((short)transformMinimum);
    __m128i v_scale  = _mm_set1_epi16 ((short)scale);
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);

    if (maxX<4)
    {
      for( int y = 0; y <= maxY; y++)
      {
        __m128i v_level = maxX == 1 ? _mm_set1_epi32( *( ( int const* ) &piQCoef[y * piQCfStride] ) ) : _vv_loadl_epi64( ( __m128i const* ) &piQCoef[y * piQCfStride] );
        v_level = _mm_and_si128(v_level,vlevmask);

        v_level = _mm_max_epi16 (v_level, v_min);
        v_level = _mm_min_epi16 (v_level, v_max);
        __m128i v_low  = _mm_mullo_epi16(v_level,v_scale);
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);

        v_level = _mm_unpacklo_epi16(v_low,v_high);
        v_level = _mm_sll_epi32(v_level,v_lshift);

        v_level = _mm_max_epi32 (v_level, v_Tmin);
        v_level = _mm_min_epi32 (v_level, v_Tmax);

        if( maxX == 1 )
        {
          _vv_storel_epi64( (__m128i*)(piCoef + y * width), v_level );
        }
        else
          _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
      }
    }
    else
    {
      for( int y = 0; y <= maxY; y++)
      {
        for( int x = 0; x <= maxX; x+=8)
        {
          __m128i v_level = _mm_loadu_si128( ( __m128i const* ) &piQCoef[x + y * piQCfStride] );
          v_level = _mm_and_si128(v_level,vlevmask);
          v_level = _mm_max_epi16 (v_level, v_min);
          v_level = _mm_min_epi16 (v_level, v_max);
          __m128i v_low  = _mm_mullo_epi16(v_level,v_scale);
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);

          v_level = _mm_unpacklo_epi16(v_low,v_high);
          v_level = _mm_sll_epi32(v_level,v_lshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );

          v_level = _mm_unpackhi_epi16(v_low,v_high);
          v_level = _mm_sll_epi32(v_level,v_lshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
        }
      }
    }
  }
}

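// QuantCoreSIMD vectorizes the scalar fallback found at the end of the
// function. Per coefficient (a sketch for reference; sign( c ) is -1 or +1):
//
//   level   = ( abs( coeff ) * defaultQuantisationCoefficient + iAdd ) >> iQBits;
//   deltaU  = ( abs( coeff ) * defaultQuantisationCoefficient - ( level << iQBits ) ) >> ( iQBits - 8 );  // sign hiding only
//   piQCoef = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, sign( coeff ) * level );
//
// Before quantizing, the SIMD paths scan 4x4 coefficient groups from the back
// and drop trailing groups whose levels cannot exceed a threshold derived from
// m_thrVal.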
template<X86_VEXT vext>
static void QuantCoreSIMD(const TransformUnit tu, const ComponentID compID, const CCoeffBuf& piCoef,CoeffSigBuf piQCoef,TCoeff &uiAbsSum, int &lastScanPos,TCoeff *deltaU,const int defaultQuantisationCoefficient,const int iQBits,const int64_t iAdd,const TCoeff entropyCodingMinimum,const TCoeff entropyCodingMaximum,const bool signHiding, const TCoeff m_thrVal)
{
  CoeffCodingContext cctx( tu, compID, signHiding );

  const CompArea &rect      = tu.blocks[compID];
  const uint32_t uiWidth    = rect.width;
  const uint32_t uiHeight   = rect.height;
  const uint32_t log2CGSize = cctx.log2CGSize();

  uiAbsSum = 0;

  const int iCGSize = 1 << log2CGSize;

  const uint32_t lfnstIdx = tu.cu->lfnstIdx;
  const int iCGNum   = lfnstIdx > 0 ? 1 : std::min<int>(JVET_C0024_ZERO_OUT_TH, uiWidth) * std::min<int>(JVET_C0024_ZERO_OUT_TH, uiHeight) >> cctx.log2CGSize();
  int       iScanPos = ( iCGNum << log2CGSize ) - 1;

  if( lfnstIdx > 0 && ( ( uiWidth == 4 && uiHeight == 4 ) || ( uiWidth == 8 && uiHeight == 8 ) ) )
  {
    iScanPos = 7;
  }

  // Find first non-zero coeff
  for( ; iScanPos > 0; iScanPos-- )
  {
    uint32_t uiBlkPos = cctx.blockPos( iScanPos );
    if( piCoef.buf[uiBlkPos] )
      break;
  }

  //////////////////////////////////////////////////////////////////////////
  // Loop over sub-sets (coefficient groups)
  //////////////////////////////////////////////////////////////////////////

  TCoeff thres = 0, useThres = 0;

  if( iQBits )
    thres = TCoeff( ( int64_t( m_thrVal ) << ( iQBits - 1 ) ) );
  else
    thres = TCoeff( ( int64_t( m_thrVal >> 1 ) << iQBits ) );

  // absolute transform-coefficient threshold below which the quantized level
  // is expected to round to zero
  useThres = thres / ( defaultQuantisationCoefficient << 2 );

  const bool is4x4sbb = log2CGSize == 4 && cctx.log2CGWidth() == 2;

  int subSetId = iScanPos >> log2CGSize;
  // if more than one 4x4 coding subblock is available, use SIMD to find the first subblock with a coefficient larger than the threshold
  if( is4x4sbb && iScanPos >= 16 )
  {
    for( ; subSetId >= 1; subSetId-- )
    {
      // move the pointer to the beginning of the current subblock
      const int iScanPosinCG = iScanPos & ( iCGSize - 1 );
      const int firstTestPos = iScanPos - iScanPosinCG;
      uint32_t  uiBlkPos     = cctx.blockPos( firstTestPos );

      const __m128i xdfTh = _mm_set1_epi32( useThres );

      // read the first line of the subblock and check for coefficients larger than the threshold,
      // assuming the subblocks are dense 4x4 blocks in raster scan order with a stride of uiWidth
      __m128i xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &piCoef.buf[uiBlkPos] ) );
      __m128i xdf = _mm_cmpgt_epi32( xl0, xdfTh );

      // same for the next line in the subblock
      uiBlkPos += uiWidth;
      xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &piCoef.buf[uiBlkPos] ) );
      xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );

      // and the third line
      uiBlkPos += uiWidth;
      xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &piCoef.buf[uiBlkPos] ) );
      xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );

      // and the last line
      uiBlkPos += uiWidth;
      xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &piCoef.buf[uiBlkPos] ) );
      xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );

      if( _mm_testz_si128( xdf, xdf ) )
      {
        iScanPos -= iScanPosinCG + 1;
        continue;
      }
      else
      {
        break;
      }
    }
  }

  const int qBits8 = iQBits - 8;
  piQCoef.memset( 0 );

  lastScanPos = iScanPos;

  if( is4x4sbb && ( iScanPos & 15 ) == 15 )
  {
#if defined( USE_AVX2 ) && 0 // sometimes has undefined behavior
    if( vext >= AVX2 )
    {
      const __m256i vNull       = _mm256_setzero_si256();
      const __m256i vQuantCoeff = _mm256_set1_epi32(defaultQuantisationCoefficient);
      const __m256i vAdd        = _mm256_set1_epi64x(iAdd);
      const __m256i vMax        = _mm256_set1_epi32(entropyCodingMaximum);
      const __m256i vMin        = _mm256_set1_epi32(entropyCodingMinimum);
      const __m256i vMask       = _mm256_set_epi32( 0, -1, 0, -1, 0, -1, 0, -1 );
      __m256i vAbsSum = vNull;

      for( subSetId = iScanPos >> log2CGSize; subSetId >= 0; subSetId-- )
      {
        int uiBlockPos = cctx.blockPos( subSetId << log2CGSize );

        for( int line = 0; line < 4; line += 2, uiBlockPos += ( 2 * uiWidth ) )
        {
          __m256i vLevel = _mm256_castsi128_si256(          _mm_loadu_si128((__m128i *)&piCoef.buf[uiBlockPos] ) );       // coeff7,coeff6,coeff5,coeff4,coeff3,coeff2,coeff1,coeff0,
          vLevel = _mm256_inserti128_si256( vLevel, _mm_loadu_si128((__m128i *)&piCoef.buf[uiBlockPos + uiWidth] ), 1 );  // coeff7,coeff6,coeff5,coeff4,coeff3,coeff2,coeff1,coeff0,
          __m256i vSign    = _mm256_cmpgt_epi32 (vNull,vLevel);        // sign3,sign2,sign1,sign0 FFFF or 0000
          vLevel           = _mm256_abs_epi32 (vLevel);
          __m256i vdeltaU0 = _mm256_mul_epu32(vLevel,vQuantCoeff);     // Tmp2,Tmp0
          __m256i vdeltaU1 = _mm256_srli_si256(vLevel,4);              // abs(0,vLevel3,vLevel2,vLevel1)
          vdeltaU1         = _mm256_mul_epu32(vdeltaU1,vQuantCoeff);   // Tmp3,Tmp1
          __m256i vTmpLevel_0 = _mm256_add_epi64(vdeltaU0,vAdd);
          __m256i vTmpLevel_1 = _mm256_add_epi64(vdeltaU1,vAdd);
          const __m128i vQBits = _mm_cvtsi32_si128(iQBits);
          vTmpLevel_0 = _mm256_srl_epi64(vTmpLevel_0,vQBits);          // Int32 Tmp2,Tmp0
          vTmpLevel_1 = _mm256_srl_epi64(vTmpLevel_1,vQBits);          // Int32 Tmp3,Tmp1

          if (signHiding)
          {
            __m256i vBS0 = _mm256_sll_epi64(vTmpLevel_0,vQBits);
            __m256i vBS1 = _mm256_sll_epi64(vTmpLevel_1,vQBits);

            vdeltaU0 = _mm256_sub_epi64(vdeltaU0,vBS0);
            vdeltaU1 = _mm256_sub_epi64(vdeltaU1,vBS1);
            const __m128i vQBits8 = _mm_cvtsi32_si128(qBits8);
            vdeltaU0 = _mm256_srl_epi64(vdeltaU0,vQBits8);
            vdeltaU1 = _mm256_srl_epi64(vdeltaU1,vQBits8);
            vdeltaU0 = _mm256_and_si256(vdeltaU0,vMask);
            vdeltaU1 = _mm256_and_si256(vdeltaU1,vMask);
            vdeltaU1 = _mm256_slli_epi64(vdeltaU1,32);
            vdeltaU0 = _mm256_or_si256(vdeltaU0,vdeltaU1);
            _mm_storeu_si128( ( __m128i * )&deltaU[uiBlockPos],           _mm256_castsi256_si128  (vdeltaU0));
            _mm_storeu_si128( ( __m128i * )&deltaU[uiBlockPos + uiWidth], _mm256_extracti128_si256(vdeltaU0, 1));
          }
          __m256i vquantMag0 = _mm256_and_si256(vTmpLevel_0,vMask);
          __m256i vquantMag1 = _mm256_and_si256(vTmpLevel_1,vMask);
          vquantMag1  = _mm256_slli_epi64(vquantMag1,32);
          vTmpLevel_0 = _mm256_or_si256(vquantMag0,vquantMag1);
          vAbsSum     = _mm256_add_epi32(vAbsSum,vTmpLevel_0);
          vTmpLevel_1 = _mm256_and_si256(vTmpLevel_0,vSign);                        // mask only neg values
          vTmpLevel_0 = _mm256_andnot_si256(vSign,vTmpLevel_0);                     // mask only pos values
          vTmpLevel_0 = _mm256_sub_epi32(vTmpLevel_0,vTmpLevel_1);
          vTmpLevel_0 = _mm256_min_epi32(vMax, _mm256_max_epi32(vMin,vTmpLevel_0)); // clip to 16 Bit
          vTmpLevel_0 = _mm256_packs_epi32(vTmpLevel_0,vTmpLevel_0);
          _vv_storel_epi64( ( __m128i * )&piQCoef.buf[uiBlockPos],           _mm256_castsi256_si128  (vTmpLevel_0));
          _vv_storel_epi64( ( __m128i * )&piQCoef.buf[uiBlockPos + uiWidth], _mm256_extracti128_si256(vTmpLevel_0, 1));
        }
      }

      __m128i xAbsSum = _mm_add_epi32( _mm256_castsi256_si128( vAbsSum ), _mm256_extracti128_si256( vAbsSum, 1 ) );
      xAbsSum = _mm_hadd_epi32( xAbsSum, xAbsSum );
      xAbsSum = _mm_hadd_epi32( xAbsSum, xAbsSum );

      uiAbsSum += _mm_cvtsi128_si32( xAbsSum );
    } //AVX2
    else
#endif
    {
      const __m128i vNull       = _mm_setzero_si128();
      const __m128i vQuantCoeff = _mm_set1_epi32(defaultQuantisationCoefficient);
      const __m128i vAdd        = _mm_set1_epi64x(iAdd);
      const __m128i vMin        = _mm_set1_epi32(entropyCodingMinimum);
      const __m128i vMax        = _mm_set1_epi32(entropyCodingMaximum);
      const __m128i vMask       = _mm_set_epi32(0, -1, 0, -1);
      __m128i vAbsSum = vNull;

      for( subSetId = iScanPos >> log2CGSize; subSetId >= 0; subSetId-- )
      {
        int uiBlockPos = cctx.blockPos( subSetId << log2CGSize );

        for( int line = 0; line < 4; line++, uiBlockPos += uiWidth )
        {
          __m128i vLevel = _mm_loadu_si128((__m128i*)&piCoef.buf[uiBlockPos]);   // coeff3,coeff2,coeff1,coeff0,
          __m128i vSign  = _mm_cmpgt_epi32 (vNull,vLevel);                       // sign3,sign2,sign1,sign0 FFFF or 0000
          vLevel = _mm_abs_epi32 (vLevel);                                       // abs(vLevel3,vLevel2,vLevel1,vLevel0)
          __m128i vdeltaU0 = _mm_mul_epu32(vLevel,vQuantCoeff);                  // Tmp2,Tmp0
          __m128i vdeltaU1 = _mm_srli_si128(vLevel,4);                           // abs(0,vLevel3,vLevel2,vLevel1)
          vdeltaU1 = _mm_mul_epu32(vdeltaU1,vQuantCoeff);                        // Tmp3,Tmp1
          __m128i vTmpLevel_0 = _mm_add_epi64(vdeltaU0,vAdd);
          __m128i vTmpLevel_1 = _mm_add_epi64(vdeltaU1,vAdd);
          const __m128i vQBits = _mm_cvtsi32_si128(iQBits);
          vTmpLevel_0 = _mm_srl_epi64(vTmpLevel_0,vQBits);                       // Int32 Tmp2,Tmp0
          vTmpLevel_1 = _mm_srl_epi64(vTmpLevel_1,vQBits);                       // Int32 Tmp3,Tmp1
          if (signHiding)
          {
            __m128i vBS0 = _mm_sll_epi64(vTmpLevel_0,vQBits);
            __m128i vBS1 = _mm_sll_epi64(vTmpLevel_1,vQBits);
            vdeltaU0 = _mm_sub_epi64(vdeltaU0,vBS0);
            vdeltaU1 = _mm_sub_epi64(vdeltaU1,vBS1);
            const __m128i vQBits8 = _mm_cvtsi32_si128(qBits8);
            vdeltaU0 = _mm_srl_epi64(vdeltaU0,vQBits8);
            vdeltaU1 = _mm_srl_epi64(vdeltaU1,vQBits8);
            vdeltaU0 = _mm_and_si128(vdeltaU0,vMask);
            vdeltaU1 = _mm_and_si128(vdeltaU1,vMask);
            vdeltaU1 = _mm_slli_epi64(vdeltaU1,32);
            vdeltaU0 = _mm_or_si128(vdeltaU0,vdeltaU1);
            _mm_storeu_si128( ( __m128i * ) &deltaU[uiBlockPos],vdeltaU0);
          }
          __m128i vquantMag0 = _mm_and_si128(vTmpLevel_0,vMask);
          __m128i vquantMag1 = _mm_and_si128(vTmpLevel_1,vMask);
          vquantMag1  = _mm_slli_epi64(vquantMag1,32);
          vTmpLevel_0 = _mm_or_si128(vquantMag0,vquantMag1);
          vAbsSum     = _mm_add_epi32(vAbsSum,vTmpLevel_0);
          vTmpLevel_1 = _mm_and_si128(vTmpLevel_0,vSign);                        // mask only neg values
          vTmpLevel_0 = _mm_andnot_si128(vSign,vTmpLevel_0);                     // mask only pos values
          vTmpLevel_0 = _mm_sub_epi32(vTmpLevel_0,vTmpLevel_1);
          vTmpLevel_0 = _mm_min_epi32(vMax, _mm_max_epi32(vMin,vTmpLevel_0));    // clip to 16 Bit
          vTmpLevel_0 = _mm_packs_epi32(vTmpLevel_0,vTmpLevel_0);
          _vv_storel_epi64( ( __m128i * ) &piQCoef.buf[uiBlockPos],vTmpLevel_0);
        }
      }

      vAbsSum = _mm_hadd_epi32( vAbsSum, vAbsSum );
      vAbsSum = _mm_hadd_epi32( vAbsSum, vAbsSum );

      uiAbsSum += _mm_cvtsi128_si32( vAbsSum );
    }
  }
  else
  {
    for( int currPos = 0; currPos <= iScanPos; currPos++ )
    {
      const int     uiBlockPos = cctx.blockPos( currPos );
      const TCoeff  iLevel     = piCoef.buf[uiBlockPos];
      const TCoeff  iSign      = (iLevel < 0 ? -1: 1);
      const int64_t tmpLevel   = (int64_t)abs(iLevel) * defaultQuantisationCoefficient;
      const TCoeff  quantisedMagnitude = TCoeff((tmpLevel + iAdd ) >> iQBits);
      if (signHiding)
      {
        deltaU[uiBlockPos] = (TCoeff)((tmpLevel - ((int64_t)quantisedMagnitude<<iQBits) )>> qBits8);
      }
      uiAbsSum += quantisedMagnitude;
      const TCoeff quantisedCoefficient = quantisedMagnitude * iSign;
      piQCoef.buf[uiBlockPos] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
    } // for n
  }
}

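// NeedRdoqSIMD answers "does any coefficient quantize to a non-zero level?",
// i.e. whether ( abs( pCoeff[i] ) * quantCoeff + offset ) >> shift != 0 for
// some i (see the scalar tail below). The vector paths OR all shifted products
// into one register and test it, so they return early on the first chunk that
// contains a surviving level.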
template<X86_VEXT vext>
static bool NeedRdoqSIMD( const TCoeff* pCoeff, size_t numCoeff, int quantCoeff, int64_t offset, int shift )
{
  const __m128i vshift = _mm_cvtsi32_si128( shift );
#if USE_AVX2
  if( vext >= AVX2 && ( numCoeff & 15 ) == 0 )
  {
    __m256i xqnt = _mm256_set1_epi32( quantCoeff );
    __m256i xoff = _mm256_set1_epi64x( offset );

    for( int uiBlockPos = 0; uiBlockPos < numCoeff; uiBlockPos += 16 )
    {
      __m256i xcff = _mm256_loadu_si256( ( const __m256i* ) &pCoeff[uiBlockPos] );
      xcff = _mm256_abs_epi32( xcff );

      __m256i xlvl1 = _mm256_mul_epi32( xcff, xqnt );
      xcff = _mm256_shuffle_epi32( xcff, 1 + ( 3 << 4 ) );
      __m256i xlvl2 = _mm256_mul_epi32( xcff, xqnt );
      xlvl1 = _mm256_add_epi64( xlvl1, xoff );
      xlvl2 = _mm256_add_epi64( xlvl2, xoff );
      xlvl1 = _mm256_srl_epi64( xlvl1, vshift );
      xlvl2 = _mm256_srl_epi64( xlvl2, vshift );

      __m256i xany = _mm256_or_si256( xlvl1, xlvl2 );

      xcff = _mm256_loadu_si256( ( const __m256i* ) &pCoeff[uiBlockPos + 8] );
      xcff = _mm256_abs_epi32( xcff );

      xlvl1 = _mm256_mul_epi32( xcff, xqnt );
      xcff = _mm256_shuffle_epi32( xcff, 1 + ( 3 << 4 ) );
      xlvl2 = _mm256_mul_epi32( xcff, xqnt );
      xlvl1 = _mm256_add_epi64( xlvl1, xoff );
      xlvl2 = _mm256_add_epi64( xlvl2, xoff );
      xlvl1 = _mm256_srl_epi64( xlvl1, vshift );
      xlvl2 = _mm256_srl_epi64( xlvl2, vshift );

      xany = _mm256_or_si256( xany, _mm256_or_si256( xlvl1, xlvl2 ) );

      if( !_mm256_testz_si256( xany, xany ) )
      {
        return true;
      }
    }
    return false;
  }
  else if( vext >= AVX2 && ( numCoeff & 7 ) == 0 )
  {
    __m256i xqnt = _mm256_set1_epi32( quantCoeff );
    __m256i xoff = _mm256_set1_epi64x( offset );

    for( int uiBlockPos = 0; uiBlockPos < numCoeff; uiBlockPos += 8 )
    {
      __m256i xcff = _mm256_loadu_si256( ( const __m256i* ) &pCoeff[uiBlockPos] );
      xcff = _mm256_abs_epi32( xcff );

      __m256i xlvl1 = _mm256_mul_epi32( xcff, xqnt );
      xcff = _mm256_shuffle_epi32( xcff, 1 + ( 3 << 4 ) );
      __m256i xlvl2 = _mm256_mul_epi32( xcff, xqnt );
      xlvl1 = _mm256_add_epi64( xlvl1, xoff );
      xlvl2 = _mm256_add_epi64( xlvl2, xoff );
      xlvl1 = _mm256_srl_epi64( xlvl1, vshift );
      xlvl2 = _mm256_srl_epi64( xlvl2, vshift );

      __m256i xany = _mm256_or_si256( xlvl1, xlvl2 );

      if( !_mm256_testz_si256( xany, xany ) )
      {
        return true;
      }
    }
    return false;
  }
  else
#endif
  if( ( numCoeff & 3 ) == 0 )
  {
    __m128i xqnt = _mm_set1_epi32( quantCoeff );
    __m128i xoff = _mm_set1_epi64x( offset );

    for( int uiBlockPos = 0; uiBlockPos < numCoeff; uiBlockPos += 4 )
    {
      __m128i xcff = _mm_loadu_si128( ( const __m128i* ) &pCoeff[uiBlockPos] );
      xcff = _mm_abs_epi32( xcff );

      __m128i xlvl1 = _mm_mul_epi32( xcff, xqnt );
      xcff = _mm_shuffle_epi32( xcff, 1 + ( 3 << 4 ) );
      __m128i xlvl2 = _mm_mul_epi32( xcff, xqnt );
      xlvl1 = _mm_add_epi64( xlvl1, xoff );
      xlvl2 = _mm_add_epi64( xlvl2, xoff );
      xlvl1 = _mm_srl_epi64( xlvl1, vshift );
      xlvl2 = _mm_srl_epi64( xlvl2, vshift );

      __m128i xany = _mm_or_si128( xlvl1, xlvl2 );

      if( !_mm_test_all_zeros( xany, xany ) )
      {
        return true;
      }
    }
    return false;
  }
  else
  {
    for( int uiBlockPos = 0; uiBlockPos < numCoeff; uiBlockPos++ )
    {
      const TCoeff  iLevel   = pCoeff[uiBlockPos];
      const int64_t tmpLevel = ( int64_t ) std::abs( iLevel ) * quantCoeff;
      const TCoeff  quantisedMagnitude = TCoeff( ( tmpLevel + offset ) >> shift );

      if( quantisedMagnitude != 0 )
      {
        return true;
      }
    } // for n
    return false;
  }
}

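// _initQuantX86 installs the kernels above into Quant's dispatch pointers.
// SIMDX86 is expected to expand to the vector extension the translation unit
// is built for (SSE41 in Quant_sse41.cpp, AVX2 in Quant_avx2.cpp), so each
// unit instantiates its own specialization.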
template<X86_VEXT vext>
void Quant::_initQuantX86()
{
  xDeQuant  = DeQuantCoreSIMD<vext>;
  xQuant    = QuantCoreSIMD  <vext>;
  xNeedRdoq = NeedRdoqSIMD   <vext>;
}

template void Quant::_initQuantX86<SIMDX86>();

} // namespace vvenc

//! \}

#endif // TARGET_SIMD_X86