/src/vvenc/source/Lib/CommonLib/x86/DepQuantX86.h
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | |
43 | | #include "DepQuant.h" |
44 | | |
45 | | #if defined(TARGET_SIMD_X86) && ENABLE_SIMD_OPT_QUANT |
46 | | |
47 | | # include "x86/CommonDefX86.h" |
48 | | # include <simde/x86/sse4.1.h> |
49 | | #if defined( USE_SSE41 ) || !defined( REAL_TARGET_X86 ) |
50 | | # include <simde/x86/sse4.2.h> |
51 | | #endif |
52 | | |
53 | | #include <bitset> |
54 | | |
55 | | //! \ingroup CommonLib |
56 | | //! \{ |
57 | | namespace vvenc { |
58 | | |
59 | | #if USE_SSE41 && defined( REAL_TARGET_X86 ) |
60 | 0 | #define _my_cmpgt_epi64( a, b ) simde_mm_cmpgt_epi64( a, b ) |
61 | | #else |
62 | 0 | #define _my_cmpgt_epi64( a, b ) _mm_cmpgt_epi64( a, b ) |
63 | | #endif |
64 | | |
65 | | namespace DQInternSimd |
66 | | { |
// Advances the four parallel dependent-quantization trellis states by one scan
// position inside the current sub-block. All four states are processed jointly:
// one byte lane per state in the context arrays, one 16-bit lane per state for
// the remaining-regular-bins counters.
//
// Steps (in order):
//   1. copy the chosen decisions' RD costs into the current state memory,
//   2. derive per-decision clamped levels and capped "template" values,
//   3. permute every per-state context array so that each destination state
//      inherits the data of its predecessor state (decisions.prevId),
//   4. charge the consumed regular bins and refresh anyRemRegBinsLt4,
//   5. propagate the new level into the neighbour template sums,
//   6. precompute sig/gt1 context indices for the next scan position.
//
// \param scanInfo   current scan position info (neighbour positions, next
//                   position, sig/gtx context offsets)
// \param decisions  per-state decision chosen at this position
//                   (absLevel, prevId, rdCost)
// \param curr       state memory, updated in place
template<X86_VEXT vext>
static inline void updateStates( const DQIntern::ScanInfo& scanInfo, const DQIntern::Decisions& decisions, DQIntern::StateMem& curr )
{
  // s: predecessor state id per decision, l: clamped absolute level,
  // t: capped template value (level capped at 4+parity, plus a count bit)
  int8_t s[4] = { 0 }, t[4] = { 0 }, l[4] = { 0 };

  // low 4 words: level clamp base 126 (+parity); high 4 words: template cap 4 (+parity)
  __m128i v126_4 = _mm_setr_epi16( 126, 126, 126, 126, 4, 4, 4, 4 );
  __m128i v01 = _mm_setr_epi16( 1, 1, 1, 1, 1, 1, 1, 1 );
  // 32 = the "number of significant neighbours" increment stored at bit 5 of tplAcc
  __m128i v032 = _mm_setr_epi8( 0, 0, 0, 0, 32, 32, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0 );
  __m128i vn1 = _mm_set1_epi8( -1 );

  static_assert( sizeof( curr.rdCost ) == sizeof( decisions.rdCost ), "Non-matching array size" );
  memcpy( curr.rdCost, decisions.rdCost, sizeof( decisions.rdCost ) );

  // in signalling, the coeffs are always max 16 bit!
  __m128i v = _mm_loadu_si64( decisions.absLevel );
  // duplicate the 4x16-bit levels into both halves: low half clamps with 126,
  // high half caps with 4 (template value)
  v = _mm_unpacklo_epi64( v, v );
  __m128i p = _mm_loadu_si32( decisions.prevId );
  _mm_storeu_si32( s, p ); // store previous state indexes
  p = _mm_shuffle_epi32( p, 0 );
  // n2: lanes whose prevId < -1 (i.e. -2, no valid predecessor) -> force level 0
  __m128i n2 = _mm_cmplt_epi8( p, vn1 );
  __m128i a_1 = _mm_and_si128( v, v01 ); // level parity bit
  __m128i a_m = _mm_min_epi16( v, _mm_add_epi16( v126_4, a_1 ) );
  a_m = _mm_packs_epi16( a_m, vn1 ); // pack to bytes: [0..3]=clamped level, [4..7]=capped value
  // add the +32 count bit to the capped values of non-zero levels
  a_m = _mm_or_si128( a_m, _mm_sign_epi8( v032, a_m ) );
  a_m = _mm_andnot_si128( n2, a_m );
  _mm_storeu_si32( l, a_m ); // store abs value
  a_m = _mm_shuffle_epi32( a_m, 1 );
  _mm_storeu_si32( t, a_m ); // store capped abs value

  {
    const int ctxSize = 16 * 4;
    const int regSize = 16;

    // Build a byte shuffle that, for each destination state, selects the 4
    // context bytes of its predecessor state; negative prevId yields index -1,
    // which makes _mm_shuffle_epi8 zero the lane.
    __m128i vshuf = _mm_loadu_si32( s );
    vshuf = _mm_shuffle_epi32( vshuf, 0 );
    __m128i vshufmask = _mm_cmplt_epi8( vshuf, _mm_setzero_si128() );
    vshuf = _mm_add_epi8( vshuf, _mm_setr_epi8( 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 ) );
    vshuf = _mm_blendv_epi8( vshuf, _mm_set1_epi8( -1 ), vshufmask );

    auto* tplAcc = &curr.tplAcc[0][0];
    auto* absVal = &curr.absVal[0][0];
    auto* sum1st = &curr.sum1st[0][0];

    // Re-map all per-position context data (4 bytes per position, one per state)
    // from predecessor states to the new state ordering.
    for( int i = 0; i < ctxSize; i += regSize )
    {
      __m128i vtpl = _mm_loadu_si128( ( const __m128i* ) &tplAcc[i]);
      vtpl = _mm_shuffle_epi8( vtpl, vshuf );
      _mm_storeu_si128( ( __m128i* ) &tplAcc[i], vtpl );

      __m128i vval = _mm_loadu_si128( ( const __m128i* ) &absVal[i] );
      vval = _mm_shuffle_epi8( vval, vshuf );
      _mm_storeu_si128( ( __m128i* ) &absVal[i], vval );

      __m128i vsum = _mm_loadu_si128( ( const __m128i* ) &sum1st[i] );
      vsum = _mm_shuffle_epi8( vsum, vshuf );
      _mm_storeu_si128( ( __m128i* ) &sum1st[i], vsum );
    }

    // Inherit numSig from the predecessor and count the new non-zero level
    // (subtracting the all-ones compare mask increments by one).
    __m128i numSig = _mm_loadu_si32( curr.numSig );
    numSig = _mm_shuffle_epi8( numSig, vshuf );
    __m128i lvls = _mm_loadu_si32( l );
    lvls = _mm_cmpgt_epi8( lvls, _mm_setzero_si128() );
    numSig = _mm_subs_epi8( numSig, lvls );
    _mm_storeu_si32( curr.numSig, numSig );

    // refSbbCtxId follows the predecessor; lanes without a predecessor
    // (shuffle index -1, sign bit set) are set to -1 via the blend.
    __m128i rsc = _mm_loadu_si32( curr.refSbbCtxId );
    rsc = _mm_shuffle_epi8( rsc, vshuf );
    rsc = _mm_blendv_epi8( rsc, vshuf, vshuf );
    _mm_storeu_si32( curr.refSbbCtxId, rsc );

    // Widen the byte shuffle into a 16-bit-lane shuffle for remRegBins
    // (each state index k becomes byte pair 2k, 2k+1; invalid stays negative).
    vshuf = _mm_shuffle_epi8( vshuf, _mm_setr_epi8( 0, 0, 1, 1, 2, 2, 3, 3, -1, -1, -1, -1, -1, -1, -1, -1 ) );
    vshuf = _mm_slli_epi16( vshuf, 1 );
    vshuf = _mm_add_epi8( vshuf,
                          _mm_blendv_epi8( _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 ),
                                           _mm_setzero_si128(),
                                           vshuf ) );

    // Charge the regular bins consumed at this position: 1 sig bin, plus
    // 0/1/3 extra bins for level 0/1/>=2 (mlutb indexed by min(level,2)).
    // States without a predecessor restart from initRemRegBins.
    __m128i rrb = _mm_loadu_si64( ( const __m128i* ) curr.remRegBins );
    rrb = _mm_shuffle_epi8( rrb, vshuf );
    rrb = _mm_sub_epi16( rrb, v01 );
    rrb = _mm_blendv_epi8( rrb, _mm_set1_epi16( curr.initRemRegBins ), vshuf );
    __m128i mlvl = _mm_loadu_si32( l );
    __m128i mbins = _mm_min_epi8( mlvl, _mm_set1_epi8( 2 ) );
    __m128i mlutb = _mm_setr_epi8( 0, 1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
    rrb = _mm_sub_epi16( rrb, _mm_cvtepi8_epi16( _mm_shuffle_epi8( mlutb, mbins ) ) );
    _mm_storeu_si64( ( __m128i* ) curr.remRegBins, rrb );
    rrb = _mm_cmplt_epi16( rrb, _mm_set1_epi16( 4 ) );

    // all four 16-bit lanes live in the low 64 bits: non-zero iff any state < 4
    curr.anyRemRegBinsLt4 = !!_mm_cvtsi128_si64( rrb );

    __m128i lvl1 = _mm_loadu_si32( l );
    __m128i tpl1 = _mm_loadu_si32( t );

    // Adds the new level to the first-pass sums (saturating) and the capped
    // template value to the template accumulators of one neighbour position.
    auto update_deps_vec = [&]( int k )
    {
      int addr = scanInfo.currNbInfoSbb.invInPos[k];

      __m128i msum = _mm_loadu_si32( &curr.sum1st[addr][0] );
      msum = _mm_adds_epu8( msum, mlvl );
      _mm_storeu_si32( &curr.sum1st[addr][0], msum);

      __m128i tpl = _mm_loadu_si32( &curr.tplAcc[addr][0] );
      tpl = _mm_add_epi8( tpl, tpl1 );
      _mm_storeu_si32( &curr.tplAcc[addr][0], tpl);
    };

    // Deliberate fall-through: apply the update to all numInv neighbour positions.
    switch( scanInfo.currNbInfoSbb.numInv )
    {
    default:
    case 5:
      update_deps_vec( 4 );
    case 4:
      update_deps_vec( 3 );
    case 3:
      update_deps_vec( 2 );
    case 2:
      update_deps_vec( 1 );
    case 1:
      update_deps_vec( 0 );
    case 0:
      ;
    }

    // record the chosen levels at the current position
    _mm_storeu_si32( &curr.absVal[scanInfo.insidePos][0], lvl1);
  }

  {
    // Precompute the sig/gt1 context indices for the next scan position from
    // its template accumulator: bits 0..4 hold the neighbour level sum
    // (capped), bits 5..7 the number of significant neighbours.
    __m128i tplAcc = _mm_loadu_si32( &curr.tplAcc[scanInfo.nextInsidePos][0]);

    __m128i sumAbs1 = _mm_and_si128( tplAcc, _mm_set1_epi8( 31 ) );
    __m128i sumNum = _mm_and_si128( _mm_srli_epi32( tplAcc, 5 ), _mm_set1_epi8( 7 ) );
    __m128i sumGt1 = _mm_sub_epi8( sumAbs1, sumNum );
    sumGt1 = _mm_min_epi8( sumGt1, _mm_set1_epi8( 4 ) );
    sumGt1 = _mm_add_epi8( _mm_set1_epi8( scanInfo.gtxCtxOffsetNext ), sumGt1 );
    _mm_storeu_si32( curr.ctx.cff, sumGt1 );

    // sig context: min( (sumAbs1+1)>>1, 3 ) + offset; the srli works on 32-bit
    // lanes, so mask back to 7 bits per byte after the shift
    sumAbs1 = _mm_add_epi8( sumAbs1, _mm_set1_epi8( 1 ) );
    sumAbs1 = _mm_srli_epi32( sumAbs1, 1 );
    sumAbs1 = _mm_and_si128( sumAbs1, _mm_set1_epi8( 127 ) );
    sumAbs1 = _mm_min_epi8( sumAbs1, _mm_set1_epi8( 3 ) );
    sumAbs1 = _mm_add_epi8( _mm_set1_epi8( scanInfo.sigCtxOffsetNext ), sumAbs1 );
    _mm_storeu_si32( curr.ctx.sig, sumAbs1 );

    curr.cffBitsCtxOffset = scanInfo.gtxCtxOffsetNext;
  }
}
213 | | |
// End-of-subblock variant of updateStates: advances the four trellis states
// across a sub-block boundary. In addition to the regular state permutation it
//   - flushes the accumulated absolute levels of the finished sub-block into
//     the per-state level buffers provided by commonCtx,
//   - resets all inner-sub-block context arrays for the new sub-block,
//   - lets commonCtx update the sub-block significance context per state,
//   - takes remRegBins from the 'skip' states for decisions whose predecessor
//     was a skip state (prevId >= 4).
//
// \param scanInfo   scan position info at the sub-block boundary
// \param decisions  per-state decision chosen at this position
// \param skip       state memory of the skip states (source of remRegBins
//                   when a decision originates from a skip state)
// \param curr       state memory, updated in place
// \param commonCtx  shared sub-block context (level buffers, sbb significance)
template<X86_VEXT vext>
static inline void updateStatesEOS( const DQIntern::ScanInfo& scanInfo, const DQIntern::Decisions& decisions, const DQIntern::StateMem& skip, DQIntern::StateMem& curr, DQIntern::CommonCtx& commonCtx )
{
  // s: predecessor id (-2 marks a skip-state predecessor), l: clamped level,
  // z: 3 - prevId, negative exactly when prevId >= 4 (used as a blend mask)
  int8_t s[4] = { 0 }, l[4] = { 0 }, z[4] = { 0 };
  for( int i = 0; i < 4; ++i )
  {
    s[i] = decisions.prevId[i] >= 4 ? -2 : decisions.prevId[i];
    l[i] = s[i] > -2 ? std::min<int>( decisions.absLevel[i], 126 + ( decisions.absLevel[i] & 1 ) ) : 0;
    z[i] = 3 - decisions.prevId[i];
    curr.rdCost[i] = decisions.rdCost[i];
  }
  {
    const int ctxSize = 16 * 4;
    const int regSize = 16;

    // Per-state byte shuffle selecting each destination state's predecessor;
    // negative prevId becomes shuffle index -1 (lane zeroed).
    __m128i vshuf = _mm_loadu_si32( s );
    vshuf = _mm_shuffle_epi32( vshuf, 0 );
    __m128i vshufmask = _mm_cmplt_epi8( vshuf, _mm_setzero_si128() );
    vshuf = _mm_add_epi8( vshuf, _mm_setr_epi8( 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 ) );
    vshuf = _mm_blendv_epi8( vshuf, _mm_set1_epi8( -1 ), vshufmask );

    auto* absVal = &curr.absVal[0][0];

    // Only absVal needs re-mapping here; tplAcc/sum1st are reset below anyway.
    for( int i = 0; i < ctxSize; i += regSize )
    {
      __m128i vval = _mm_loadu_si128( ( const __m128i* ) &absVal[i] );
      vval = _mm_shuffle_epi8( vval, vshuf );
      _mm_storeu_si128( ( __m128i* ) &absVal[i], vval );
    }

    // Inherit numSig, record the new levels at the current position, and
    // count the non-zero ones (subtracting the compare mask adds one).
    __m128i numSig = _mm_loadu_si32( curr.numSig );
    numSig = _mm_shuffle_epi8( numSig, vshuf );
    __m128i lvls = _mm_loadu_si32( l );
    _mm_storeu_si32( &curr.absVal[scanInfo.insidePos][0], lvls);
    lvls = _mm_cmpgt_epi8( lvls, _mm_setzero_si128() );
    numSig = _mm_subs_epi8( numSig, lvls );
    _mm_storeu_si32( curr.numSig, numSig );

    // refSbbCtxId follows the predecessor; predecessor-less lanes become -1.
    __m128i rsc = _mm_loadu_si32( curr.refSbbCtxId );
    rsc = _mm_shuffle_epi8( rsc, vshuf );
    rsc = _mm_blendv_epi8( rsc, vshuf, vshuf );
    _mm_storeu_si32( curr.refSbbCtxId, rsc );

    // Widen the byte shuffle into a 16-bit-lane shuffle for remRegBins.
    vshuf = _mm_shuffle_epi8( vshuf, _mm_setr_epi8( 0, 0, 1, 1, 2, 2, 3, 3, -1, -1, -1, -1, -1, -1, -1, -1 ) );
    vshuf = _mm_slli_epi16( vshuf, 1 );
    vshuf = _mm_add_epi8( vshuf,
                          _mm_blendv_epi8( _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 ),
                                           _mm_setzero_si128(),
                                           vshuf ) );

    __m128i rrb = _mm_loadu_si64( ( const __m128i* ) curr.remRegBins );
    rrb = _mm_shuffle_epi8( rrb, vshuf );
    rrb = _mm_sub_epi16( rrb, _mm_set1_epi16( 1 ) );
    rrb = _mm_blendv_epi8( rrb, _mm_set1_epi16( curr.initRemRegBins ), vshuf );

    // Decisions coming from a skip state (z negative -> mask sign bit set)
    // take their remaining bins from the skip state memory instead.
    __m128i vskip = _mm_cvtepi8_epi16( _mm_loadu_si32( z ) );
    rrb = _mm_blendv_epi8( rrb, _mm_loadu_si64( ( const __m128i* ) skip.remRegBins ), vskip );

    // Charge 0/1/3 extra bins for level 0/1/>=2 (mlutb indexed by min(level,2)).
    __m128i mlvl = _mm_loadu_si32( l );
    __m128i mbins = _mm_min_epi8( mlvl, _mm_set1_epi8( 2 ) );
    __m128i mlutb = _mm_setr_epi8( 0, 1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
    rrb = _mm_sub_epi16( rrb, _mm_cvtepi8_epi16( _mm_shuffle_epi8( mlutb, mbins ) ) );
    _mm_storeu_si64( ( __m128i* ) curr.remRegBins, rrb );
    rrb = _mm_cmplt_epi16( rrb, _mm_set1_epi16( 4 ) );

    // all four 16-bit lanes live in the low 64 bits: non-zero iff any state < 4
    curr.anyRemRegBinsLt4 = !!_mm_cvtsi128_si64( rrb );
  }

  {
    // De-interleave the 4-bytes-per-position absVal layout into the four
    // per-state level buffers of the finished sub-block (vshufK picks every
    // 4th byte starting at K).
    uint8_t* levels0;
    uint8_t* levels1;
    uint8_t* levels2;
    uint8_t* levels3;

    commonCtx.getLevelPtrs( scanInfo, levels0, levels1, levels2, levels3 );

    const int regSize = 16;
    const int ctxSize = scanInfo.sbbSize << 2;

    const __m128i vshuf0 = _mm_setr_epi8( 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
    const __m128i vshuf1 = _mm_setr_epi8( 1, 5, 9, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
    const __m128i vshuf2 = _mm_setr_epi8( 2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
    const __m128i vshuf3 = _mm_setr_epi8( 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );

    auto* absVal = &curr.absVal[0][0];

    for( int i = 0, j = 0; i < ctxSize; i += regSize, j += 4 )
    {
      __m128i in = _mm_loadu_si128( ( const __m128i* ) &absVal[i] );

      _mm_storeu_si32( &levels0[j], _mm_shuffle_epi8( in, vshuf0 ) );
      _mm_storeu_si32( &levels1[j], _mm_shuffle_epi8( in, vshuf1 ) );
      _mm_storeu_si32( &levels2[j], _mm_shuffle_epi8( in, vshuf2 ) );
      _mm_storeu_si32( &levels3[j], _mm_shuffle_epi8( in, vshuf3 ) );
    }
  }

  // Start the new sub-block with cleared inner-sub-block context.
  memset( curr.absVal, 0, sizeof( curr.absVal ) );
  memset( curr.tplAcc, 0, sizeof( curr.tplAcc ) );
  memset( curr.sum1st, 0, sizeof( curr.sum1st ) );

  // Let the common context update the sub-block significance data per state;
  // refId selects which previous state's sbb context to reference
  // (-1: none, <4: inherited refSbbCtxId, >=4: skip state prevId-4).
  for( int i = 0; i < 4; i++ )
  {
    int prevId = decisions.prevId[i];

    if( prevId > -2 )
    {
      const int refId = prevId < 0 ? -1 : ( prevId < 4 ? curr.refSbbCtxId[i] : prevId - 4 );
      commonCtx.update( scanInfo, refId, i, curr );
    }
  }

  memset( curr.numSig, 0, sizeof( curr.numSig ) );

  {
    // Precompute sig/gt1 context indices for the next scan position from its
    // template accumulator (bits 0..4: neighbour level sum, bits 5..7: count).
    __m128i tplAcc = _mm_loadu_si32( &curr.tplAcc[scanInfo.nextInsidePos][0]);

    __m128i sumAbs1 = _mm_and_si128( tplAcc, _mm_set1_epi8( 31 ) );
    __m128i sumNum = _mm_and_si128( _mm_srli_epi32( tplAcc, 5 ), _mm_set1_epi8( 7 ) );
    __m128i sumGt1 = _mm_sub_epi8( sumAbs1, sumNum );
    sumGt1 = _mm_min_epi8( sumGt1, _mm_set1_epi8( 4 ) );
    sumGt1 = _mm_add_epi8( _mm_set1_epi8( scanInfo.gtxCtxOffsetNext ), sumGt1 );
    _mm_storeu_si32( curr.ctx.cff, sumGt1 );

    // sig context: min( (sumAbs1+1)>>1, 3 ) + offset; mask back to 7 bits per
    // byte because the shift operates on 32-bit lanes
    sumAbs1 = _mm_add_epi8( sumAbs1, _mm_set1_epi8( 1 ) );
    sumAbs1 = _mm_srli_epi32( sumAbs1, 1 );
    sumAbs1 = _mm_and_si128( sumAbs1, _mm_set1_epi8( 127 ) );
    sumAbs1 = _mm_min_epi8( sumAbs1, _mm_set1_epi8( 3 ) );
    sumAbs1 = _mm_add_epi8( _mm_set1_epi8( scanInfo.sigCtxOffsetNext ), sumAbs1 );
    _mm_storeu_si32( curr.ctx.sig, sumAbs1 );

    curr.cffBitsCtxOffset = scanInfo.gtxCtxOffsetNext;
  }
}
348 | | |
349 | | // has to be called as a first check, assumes no decision has been made yet |
// Evaluates, for all four trellis states at once, the three candidate
// transitions per destination state (candidate A / candidate B with their
// parallel-quantizer levels, or the zero level Z) and writes the minimum-cost
// choice into 'decisions'. RD costs are accumulated as 64-bit sums of the
// predecessor cost, the distortion delta and the fractional coeff/sig
// (and, for SCAN_SOCSBB, sub-block) bits.
//
// Lane layout: the *01 registers hold the two 64-bit costs for decisions 0,1
// and the *23 registers those for decisions 2,3, following the state mapping
// documented below.
//
// \param spt        scan position type (inside / start-of / end-of sub-block)
// \param pqData     the four parallel-quantization candidates (level + deltaDist)
// \param decisions  output: per-state best rdCost, absLevel and prevId
// \param state      current state memory (costs, contexts, bit tables)
template<X86_VEXT vext>
static void checkAllRdCosts( const DQIntern::ScanPosType spt, const DQIntern::PQData* pqData, DQIntern::Decisions& decisions, const DQIntern::StateMem& state )
{
  // State mapping
  // decision 0: either A from 0 (pq0), or B from 1 (pq2), or 0 from 0
  // decision 1: either A from 2 (pq3), or B from 3 (pq1), or 0 from 2
  // decision 2: either A from 1 (pq0), or B from 0 (pq2), or 0 from 1
  // decision 3: either A from 3 (pq3), or B from 2 (pq1), or 0 from 3

  __m128i mrd01 = _mm_loadu_si128( ( const __m128i* ) & state.rdCost[0] );
  __m128i mrd23 = _mm_loadu_si128( ( const __m128i* ) & state.rdCost[2] );

  //int64_t rdCostA = state.rdCost[m_stateId] + pqDataA.deltaDist;
  //int64_t rdCostB = state.rdCost[m_stateId] + pqDataB.deltaDist;
  //int64_t rdCostZ = state.rdCost[m_stateId];
  __m128i rdCostZ01 = _mm_unpacklo_epi64( mrd01, mrd23 );
  __m128i rdCostZ23 = _mm_unpackhi_epi64( mrd01, mrd23 );
  // B candidates use pqData[2]/pqData[1], A candidates pqData[0]/pqData[3]
  __m128i deltaDist = _mm_unpacklo_epi64( _mm_loadu_si64( &pqData[2].deltaDist ), _mm_loadu_si64( &pqData[1].deltaDist ) );
  __m128i rdCostB01 = _mm_add_epi64( rdCostZ23, deltaDist );
  __m128i rdCostB23 = _mm_add_epi64( rdCostZ01, deltaDist );
  deltaDist = _mm_unpacklo_epi64( _mm_loadu_si64( &pqData[0].deltaDist ), _mm_loadu_si64( &pqData[3].deltaDist ) );
  __m128i rdCostA01 = _mm_add_epi64( rdCostZ01, deltaDist );
  __m128i rdCostA23 = _mm_add_epi64( rdCostZ23, deltaDist );

  //const CoeffFracBits &cffBits = m_gtxFracBitsArray[state.ctx.cff[m_stateId]];
  //const BinFracBits sigBits = m_sigFracBitsArray[state.ctx.sig[m_stateId]];
  //
  //rdCostA += cffBits.bits[ pqDataA.absLevel ];
  //rdCostB += cffBits.bits[ pqDataB.absLevel ];
  // Gather each state's sig-flag bit pair (intBits[0], intBits[1]).
  __m128i sgbts02 = _mm_unpacklo_epi64( _mm_loadu_si64( &state.m_sigFracBitsArray[0][state.ctx.sig[0]] ),
                                        _mm_loadu_si64( &state.m_sigFracBitsArray[2][state.ctx.sig[2]] ) );
  __m128i sgbts13 = _mm_unpacklo_epi64( _mm_loadu_si64( &state.m_sigFracBitsArray[1][state.ctx.sig[1]] ),
                                        _mm_loadu_si64( &state.m_sigFracBitsArray[3][state.ctx.sig[3]] ) );

  {
    // Transpose so the low 64 bits hold both states' intBits[0] and the high
    // 64 bits both states' intBits[1].
    __m128i sgbts02_0 = _mm_shuffle_epi32( sgbts02, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
    __m128i sgbts02_1 = _mm_shuffle_epi32( sgbts02, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
    __m128i sgbts13_0 = _mm_shuffle_epi32( sgbts13, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
    __m128i sgbts13_1 = _mm_shuffle_epi32( sgbts13, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );

    sgbts02 = _mm_unpacklo_epi64( sgbts02_0, sgbts02_1 );
    sgbts13 = _mm_unpacklo_epi64( sgbts13_0, sgbts13_1 );
  }

  {
    // coeff context is independent of state
    auto& base = state.m_gtxFracBitsArray;

    // coefficient-level bits for the B candidates, in decision order 0,1,2,3
    int32_t cffBitsArr[4] =
    {
      base[state.ctx.cff[1]].bits[pqData[2].absLevel],
      base[state.ctx.cff[3]].bits[pqData[1].absLevel],
      base[state.ctx.cff[0]].bits[pqData[2].absLevel],
      base[state.ctx.cff[2]].bits[pqData[1].absLevel],
    };

    __m128i cffBits = _mm_loadu_si128( ( const __m128i* ) cffBitsArr );
    __m128i add = _mm_cvtepi32_epi64( cffBits );
    rdCostB01 = _mm_add_epi64( rdCostB01, add );
    add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) );
    rdCostB23 = _mm_add_epi64( rdCostB23, add );
  }

  {
    // coeff context is independent of state
    auto& base = state.m_gtxFracBitsArray;

    // coefficient-level bits for the A candidates, in decision order 0,1,2,3
    int32_t cffBitsArr[4] =
    {
      base[state.ctx.cff[0]].bits[pqData[0].absLevel],
      base[state.ctx.cff[2]].bits[pqData[3].absLevel],
      base[state.ctx.cff[1]].bits[pqData[0].absLevel],
      base[state.ctx.cff[3]].bits[pqData[3].absLevel],
    };

    __m128i cffBits = _mm_loadu_si128( ( const __m128i* ) cffBitsArr );
    __m128i add = _mm_cvtepi32_epi64( cffBits );
    rdCostA01 = _mm_add_epi64( rdCostA01, add );
    add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) );
    rdCostA23 = _mm_add_epi64( rdCostA23, add );
  }

  if( spt == DQIntern::SCAN_ISCSBB )
  {
    // rdCostZ += sigBits.intBits[ 0 ];
    rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
    rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );

    // move intBits[1] into the low half for the non-zero candidates
    sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
    sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );

    // rdCostB += sigBits.intBits[ 1 ];
    rdCostB01 = _mm_add_epi64( rdCostB01, _mm_cvtepi32_epi64( sgbts13 ) );
    rdCostB23 = _mm_add_epi64( rdCostB23, _mm_cvtepi32_epi64( sgbts02 ) );

    // rdCostA += sigBits.intBits[ 1 ];
    rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts02 ) );
    rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts13 ) );
  }
  else if( spt == DQIntern::SCAN_SOCSBB )
  {
    // rdCostA += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 1 ];
    // rdCostB += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 1 ];
    // rdCostZ += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 0 ];
    __m128i sbbBits = _mm_loadu_si128( ( const __m128i* ) state.sbbBits1 );
    sbbBits = _mm_shuffle_epi32( sbbBits, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );

    rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
    rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );

    // sub-block significance bits are charged to all three candidates
    __m128i add = _mm_cvtepi32_epi64( sbbBits );
    rdCostB23 = _mm_add_epi64( rdCostB23, add );
    rdCostA01 = _mm_add_epi64( rdCostA01, add );
    rdCostZ01 = _mm_add_epi64( rdCostZ01, add );
    add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( sbbBits, sbbBits ) );
    rdCostB01 = _mm_add_epi64( rdCostB01, add );
    rdCostA23 = _mm_add_epi64( rdCostA23, add );
    rdCostZ23 = _mm_add_epi64( rdCostZ23, add );

    sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
    sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );
    rdCostB01 = _mm_add_epi64( rdCostB01, _mm_cvtepi32_epi64( sgbts13 ) );
    rdCostB23 = _mm_add_epi64( rdCostB23, _mm_cvtepi32_epi64( sgbts02 ) );

    rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts02 ) );
    rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts13 ) );
  }
  else
  {
    //else if( state.numSig[m_stateId] )
    //{
    //  rdCostA += sigBits.intBits[ 1 ];
    //  rdCostB += sigBits.intBits[ 1 ];
    //  rdCostZ += sigBits.intBits[ 0 ];
    //}
    //else
    //{
    //  rdCostZ = decisionA.rdCost;
    //}
    __m128i numSig = _mm_loadu_si32( state.numSig );

    rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
    rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );

    // 64-bit masks: all-ones where the source state has significant coeffs
    __m128i mask13 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3 ) );
    mask13 = _mm_cmpgt_epi8( mask13, _mm_setzero_si128() );
    __m128i mask02 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2 ) );
    mask02 = _mm_cmpgt_epi8( mask02, _mm_setzero_si128() );

    sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
    sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );

    rdCostB01 = _mm_add_epi64( rdCostB01, _mm_and_si128( mask13, _mm_cvtepi32_epi64( sgbts13 ) ) );
    rdCostB23 = _mm_add_epi64( rdCostB23, _mm_and_si128( mask02, _mm_cvtepi32_epi64( sgbts02 ) ) );

    rdCostA01 = _mm_add_epi64( rdCostA01, _mm_and_si128( mask02, _mm_cvtepi32_epi64( sgbts02 ) ) );
    rdCostA23 = _mm_add_epi64( rdCostA23, _mm_and_si128( mask13, _mm_cvtepi32_epi64( sgbts13 ) ) );

    // states without significant coeffs get the Z candidate disabled by
    // assigning it the maximal initial RD cost
    __m128i rdMax = _mm_loadu_si64( &DQIntern::rdCostInit );
    rdMax = _mm_unpacklo_epi64( rdMax, rdMax );

    rdCostZ01 = _mm_blendv_epi8( rdMax, rdCostZ01, mask02 );
    rdCostZ23 = _mm_blendv_epi8( rdMax, rdCostZ23, mask13 );
  }
  // decision 0: either A from 0 (pq0), or B from 1 (pq2), or 0 from 0
  // decision 1: either A from 2 (pq3), or B from 3 (pq1), or 0 from 2
  // decision 2: either A from 1 (pq0), or B from 0 (pq2), or 0 from 1
  // decision 3: either A from 3 (pq3), or B from 2 (pq1), or 0 from 3
  // Z0, or A0, or B0
  // Z1, or A1, or B1
  // B2, or Z2, or A2
  // B3, or Z3, or A3

  // Tournament minimum over the three candidates per decision: start from
  // Z (decisions 0,1) / B (decisions 2,3), then compare against the other
  // two candidates, tracking the winning level and predecessor id.
  __m128i rdBest01 = rdCostZ01;
  __m128i rdBest23 = rdCostB23;

  __m128i valBest = _mm_setr_epi32( 0, 0, pqData[2].absLevel, pqData[1].absLevel );

#if ENABLE_VALGRIND_CODE
  // just to avoid strange "unknown instruction" error
  __m128i valCand = _mm_setr_epi32( 0, pqData[3].absLevel, 0, 0 );
  valCand = _mm_insert_epi32( valCand, pqData[0].absLevel, 0 );
#else
  __m128i valCand = _mm_setr_epi32( pqData[0].absLevel, pqData[3].absLevel, 0, 0 );
#endif
  __m128i idxBest = _mm_setr_epi32( 0, 2, 0, 2 );
  __m128i idxCand = _mm_setr_epi32( 0, 2, 1, 3 );

  // first round: A vs. current best for decisions 0,1 / Z vs. best for 2,3;
  // _my_cmpgt_epi64 provides the signed 64-bit compare (SSE4.2 or simde)
  __m128i chng01 = _my_cmpgt_epi64( rdBest01, rdCostA01 );
  __m128i chng23 = _my_cmpgt_epi64( rdBest23, rdCostZ23 );
  __m128i chng = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // 00110011
  chng = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );

  rdBest01 = _mm_blendv_epi8( rdBest01, rdCostA01, chng01 );
  rdBest23 = _mm_blendv_epi8( rdBest23, rdCostZ23, chng23 );

  valBest = _mm_blendv_epi8( valBest, valCand, chng );
  idxBest = _mm_blendv_epi8( idxBest, idxCand, chng );

  // second round: remaining candidate (B for 0,1 / A for 2,3)
  valCand = _mm_setr_epi32( pqData[2].absLevel, pqData[1].absLevel, pqData[0].absLevel, pqData[3].absLevel );
  idxCand = _mm_setr_epi32( 1, 3, 1, 3 );

  chng01 = _my_cmpgt_epi64( rdBest01, rdCostB01 );
  chng23 = _my_cmpgt_epi64( rdBest23, rdCostA23 );
  chng = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // 00110011
  chng = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );

  rdBest01 = _mm_blendv_epi8( rdBest01, rdCostB01, chng01 );
  rdBest23 = _mm_blendv_epi8( rdBest23, rdCostA23, chng23 );

  valBest = _mm_blendv_epi8( valBest, valCand, chng );
  idxBest = _mm_blendv_epi8( idxBest, idxCand, chng );

  // narrow levels to 4x16-bit and prevIds to 4x8-bit for the decision arrays
  valBest = _mm_packs_epi32( valBest, _mm_setzero_si128() );
  idxBest = _mm_packs_epi32( idxBest, _mm_setzero_si128() );
  idxBest = _mm_packs_epi16( idxBest, _mm_setzero_si128() );

  _mm_storeu_si128( ( __m128i* ) & decisions.rdCost[0], rdBest01 );
  _mm_storeu_si128( ( __m128i* ) & decisions.rdCost[2], rdBest23 );

  _mm_storeu_si64( decisions.absLevel, valBest );
  _mm_storeu_si32( decisions.prevId, idxBest );
}
576 | | |
// Has to be called as a first check, assumes no decision has been made yet!!!
// Evaluates, for all four trellis states at once, the RD cost of coding the
// current coefficient either as 0 (rdCostZ*) or as the level-1 candidate
// (rdCostA*, distortion pq_a_dist/pq_b_dist), then keeps the cheaper option
// per decision slot and writes cost/level/source-state into 'decisions'.
template<X86_VEXT vext>
static inline void checkAllRdCostsOdd1( const DQIntern::ScanPosType spt, const int64_t pq_a_dist, const int64_t pq_b_dist, DQIntern::Decisions& decisions, const DQIntern::StateMem& state )
{
  // State mapping
  // decision 0: either 1 from 1 (pqData[2]), or 0 from 0
  // decision 1: either 1 from 3 (pqData[1]), or 0 from 2
  // decision 2: either 1 from 0 (pqData[2]), or 0 from 1
  // decision 3: either 1 from 2 (pqData[1]), or 0 from 3

  // Previous-state RD costs, two 64-bit values per register.
  __m128i mrd01 = _mm_loadu_si128( ( const __m128i* ) & state.rdCost[0] );
  __m128i mrd23 = _mm_loadu_si128( ( const __m128i* ) & state.rdCost[2] );

  //int64_t rdCostA = state.rdCost[m_stateId] + pqDataA.deltaDist; // done
  //int64_t rdCostZ = state.rdCost[m_stateId]; // done
  // Re-pack to the decision mapping above: rdCostZ01 = { rd[0], rd[2] },
  // rdCostZ23 = { rd[1], rd[3] }.
  __m128i rdCostZ01 = _mm_unpacklo_epi64( mrd01, mrd23 );
  __m128i rdCostZ23 = _mm_unpackhi_epi64( mrd01, mrd23 );
  // deltaDist = { pq_b_dist, pq_a_dist }; the level-1 candidates originate
  // from the swapped states, hence A01 builds on Z23 and A23 on Z01.
  __m128i deltaDist = _mm_unpacklo_epi64( _mm_cvtsi64_si128( pq_b_dist ), _mm_cvtsi64_si128( pq_a_dist ) );
  __m128i rdCostA01 = _mm_add_epi64( rdCostZ23, deltaDist );
  __m128i rdCostA23 = _mm_add_epi64( rdCostZ01, deltaDist );

  //const BinFracBits sigBits = m_sigFracBitsArray[state.ctx.sig[m_stateId]];
  //
  //rdCostA += m_gtxFracBitsArray[state.ctx.cff[m_stateId]].bits[1]; // done
  //
  // Gather the per-state significance-flag fractional bits; each 64-bit load
  // fetches the { intBits[0], intBits[1] } pair for one state.
  __m128i sgbts02 = _mm_unpacklo_epi64( _mm_loadu_si64( &state.m_sigFracBitsArray[0][state.ctx.sig[0]] ),
                                        _mm_loadu_si64( &state.m_sigFracBitsArray[2][state.ctx.sig[2]] ) );
  __m128i sgbts13 = _mm_unpacklo_epi64( _mm_loadu_si64( &state.m_sigFracBitsArray[1][state.ctx.sig[1]] ),
                                        _mm_loadu_si64( &state.m_sigFracBitsArray[3][state.ctx.sig[3]] ) );

  {
    // Transpose so the low 64-bit half holds both intBits[0] values and the
    // high half both intBits[1] values of the two states in each register.
    __m128i sgbts02_0 = _mm_shuffle_epi32( sgbts02, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
    __m128i sgbts02_1 = _mm_shuffle_epi32( sgbts02, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
    __m128i sgbts13_0 = _mm_shuffle_epi32( sgbts13, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
    __m128i sgbts13_1 = _mm_shuffle_epi32( sgbts13, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );

    sgbts02 = _mm_unpacklo_epi64( sgbts02_0, sgbts02_1 );
    sgbts13 = _mm_unpacklo_epi64( sgbts13_0, sgbts13_1 );
  }

  {
    // Add the level-1 coefficient bits, looked up per state via ctx.cff.
#if USE_AVX2
    __m128i cffidx = _mm_cvtepi8_epi32( _mm_loadu_si32( &state.ctx.cff ) );
    // Reorder to the decision mapping { cff[1], cff[3], cff[0], cff[2] }.
    cffidx = _mm_shuffle_epi32( cffidx, ( 1 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
    cffidx = _mm_sub_epi8( cffidx, _mm_set1_epi32( state.cffBitsCtxOffset ) );
    __m256i cffBits256 = _mm256_loadu_si256( ( const __m256i* ) & state.cffBits1[state.cffBitsCtxOffset] );
    cffBits256 = _mm256_permutevar8x32_epi32( cffBits256, _mm256_castsi128_si256( cffidx ) );
    __m128i cffBits = _mm256_castsi256_si128( cffBits256 );
#else
    // SSE fallback: emulate the 8x32 permute with byte-shuffles over the
    // first four entries (bits0123) and the fifth entry (bits4).
    __m128i cffBits;
    __m128i bits0123 = _mm_loadu_si128( ( const __m128i* ) & state.cffBits1[state.cffBitsCtxOffset + 0] );
    __m128i bits4    = _mm_loadu_si32( &state.cffBits1[state.cffBitsCtxOffset + 4] );
    __m128i cfCtxIdx = _mm_loadu_si32( &state.ctx.cff );
    cfCtxIdx = _mm_cvtepi8_epi32( cfCtxIdx );
    cfCtxIdx = _mm_sub_epi8( cfCtxIdx, _mm_set1_epi32( state.cffBitsCtxOffset ) );
    // Broadcast each 32-bit index into all four bytes of its lane, multiply
    // by 4 and add 0..3 to form per-byte gather indices for _mm_shuffle_epi8.
    cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_slli_si128( cfCtxIdx, 1 ) );
    cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_slli_si128( cfCtxIdx, 2 ) );
    cfCtxIdx = _mm_slli_epi32( cfCtxIdx, 2 );
    cfCtxIdx = _mm_add_epi8( cfCtxIdx, _mm_setr_epi8( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 ) );
    // Byte indices >= 16 belong to bits4: fetch them there, then force the
    // same lanes negative (shuffle yields 0 for negative indices) so the
    // bits0123 shuffle contributes nothing for them, and OR both together.
    cffBits  = _mm_shuffle_epi8( bits4, _mm_sub_epi8( cfCtxIdx, _mm_set1_epi8( 16 ) ) );
    cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_cmpgt_epi8( cfCtxIdx, _mm_set1_epi8( 15 ) ) );
    cffBits  = _mm_or_si128( cffBits, _mm_shuffle_epi8( bits0123, cfCtxIdx ) );
    // Reorder to the decision mapping { cff[1], cff[3], cff[0], cff[2] }.
    cffBits  = _mm_shuffle_epi32( cffBits, ( 1 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
#endif
    rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( cffBits ) );
    rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) ) );
  }

  if( spt == DQIntern::SCAN_ISCSBB )
  {
    // rdCostZ += sigBits.intBits[ 0 ]; // done
    rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
    rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );

    // Move intBits[1] into the low half for the sign-extending adds below.
    sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
    sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );

    // rdCostA += sigBits.intBits[ 1 ]; // done
    // Cross-wired: A01 derives from states {1,3}, A23 from states {0,2}.
    rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts13 ) );
    rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts02 ) );
  }
  else if( spt == DQIntern::SCAN_SOCSBB )
  {
    // rdCostZ += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 0 ]; // done
    // rdCostA += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 1 ]; // done
    __m128i sbbBits = _mm_loadu_si128( ( const __m128i* ) state.sbbBits1 );
    sbbBits = _mm_shuffle_epi32( sbbBits, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );

    rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
    rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );

    // Sub-block significance bits, again with the crossed A/Z pairing.
    __m128i add = _mm_cvtepi32_epi64( sbbBits );
    rdCostA23 = _mm_add_epi64( rdCostA23, add );
    rdCostZ01 = _mm_add_epi64( rdCostZ01, add );
    add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( sbbBits, sbbBits ) );
    rdCostA01 = _mm_add_epi64( rdCostA01, add );
    rdCostZ23 = _mm_add_epi64( rdCostZ23, add );

    sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
    sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );
    rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts13 ) );
    rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts02 ) );
  }
  else
  {
    //else if( m_numSigSbb )
    //{
    //  rdCostA += sigBits.intBits[ 1 ]; // done
    //  rdCostZ += sigBits.intBits[ 0 ]; // done
    //}
    //else
    //{
    //  rdCostZ = decisionZ.rdCost; // done
    //}

    __m128i numSig = _mm_loadu_si32( state.numSig );

    rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
    rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );

    // Build 64-bit lane masks that are all-ones where the source state has
    // at least one significant coefficient (numSig[s] > 0); mask01 tests
    // states {1,3} (feeding A01/Z23), mask23 tests states {0,2}.
    __m128i mask01 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3 ) );
    mask01 = _mm_cmpgt_epi8( mask01, _mm_setzero_si128() );
    __m128i mask23 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2 ) );
    mask23 = _mm_cmpgt_epi8( mask23, _mm_setzero_si128() );
    sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
    sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );
    // Only states with significant coefficients pay the sig bits on the
    // level-1 path.
    rdCostA01 = _mm_add_epi64( rdCostA01, _mm_and_si128( mask01, _mm_cvtepi32_epi64( sgbts13 ) ) );
    rdCostA23 = _mm_add_epi64( rdCostA23, _mm_and_si128( mask23, _mm_cvtepi32_epi64( sgbts02 ) ) );

    // Zero-decision costs of states without significant coefficients are
    // pinned to the init value (corresponds to rdCostZ = decisionZ.rdCost).
    __m128i rdMax = _mm_loadu_si64( &DQIntern::rdCostInit );
    rdMax = _mm_unpacklo_epi64( rdMax, rdMax );

    rdCostZ01 = _mm_blendv_epi8( rdMax, rdCostZ01, mask23 );
    rdCostZ23 = _mm_blendv_epi8( rdMax, rdCostZ23, mask01 );
  }

  //// decision 0: either 1 from 1 (pqData[2]), or 0 from 0
  //// decision 1: either 1 from 3 (pqData[1]), or 0 from 2
  //// decision 2: either 1 from 0 (pqData[2]), or 0 from 1
  //// decision 3: either 1 from 2 (pqData[1]), or 0 from 3

  // d0: Z0, or A0
  // d1: Z1, or A1
  // d2: A2, or Z2
  // d3: A3, or Z3

  // Start from the first candidate of each decision and switch only on a
  // strictly larger cost, so ties keep the initial candidate.
  __m128i rdBest01 = rdCostZ01;
  __m128i rdBest23 = rdCostA23;

  __m128i valBest = _mm_setr_epi32( 0, 0, 1, 1 );
  __m128i valCand = _mm_setr_epi32( 1, 1, 0, 0 );

  __m128i idxBest = _mm_setr_epi32( 0, 2, 0, 2 );
  __m128i idxCand = _mm_setr_epi32( 1, 3, 1, 3 );

  __m128i chng01 = _my_cmpgt_epi64( rdBest01, rdCostA01 );
  __m128i chng23 = _my_cmpgt_epi64( rdBest23, rdCostZ23 );
  // Compress the two 64-bit change masks into one register of 32-bit masks
  // ordered d0,d1,d2,d3 for the level/prevId blends below.
  __m128i chng = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // 00110011
  chng = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );

  rdBest01 = _mm_blendv_epi8( rdBest01, rdCostA01, chng01 );
  rdBest23 = _mm_blendv_epi8( rdBest23, rdCostZ23, chng23 );

  _mm_storeu_si128( ( __m128i* ) & decisions.rdCost[0], rdBest01 );
  _mm_storeu_si128( ( __m128i* ) & decisions.rdCost[2], rdBest23 );

  // Narrow the winning levels to 16 bit and source-state ids to 8 bit
  // before storing into the decision arrays.
  valBest = _mm_packs_epi32( _mm_blendv_epi8( valBest, valCand, chng ), _mm_setzero_si128() );
  idxBest = _mm_packs_epi32( _mm_blendv_epi8( idxBest, idxCand, chng ), _mm_setzero_si128() );
  idxBest = _mm_packs_epi16( idxBest, _mm_setzero_si128() );

  _mm_storeu_si64( decisions.absLevel, valBest );
  _mm_storeu_si32( decisions.prevId, idxBest );
}
750 | | |
751 | | template<X86_VEXT vext> |
752 | | void findFirstPos( int& firstTestPos, const TCoeff* tCoeff, const DQIntern::TUParameters& tuPars, int defaultTh, bool zeroOutForThres, int zeroOutWidth, int zeroOutHeight ) |
753 | 0 | { |
754 | 0 | if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 ) |
755 | 0 | { |
756 | 0 | const int sbbSize = tuPars.m_sbbSize; |
757 | | // move the pointer to the beginning of the current subblock |
758 | 0 | firstTestPos -= ( sbbSize - 1 ); |
759 | |
|
760 | 0 | const __m128i xdfTh = _mm_set1_epi32( defaultTh ); |
761 | | |
762 | | // for each subblock |
763 | 0 | for( ; firstTestPos >= 0; firstTestPos -= sbbSize ) |
764 | 0 | { |
765 | | // skip zeroed out blocks |
766 | | // for 64-point transformation the coding order takes care of that |
767 | 0 | if( zeroOutForThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) ) |
768 | 0 | { |
769 | 0 | continue; |
770 | 0 | } |
771 | | |
772 | | // read first line of the subblock and check for coefficients larger than the threshold |
773 | | // assumming the subblocks are dense 4x4 blocks in raster scan order with the stride of tuPars.m_width |
774 | 0 | int pos = tuPars.m_scanId2BlkPos[firstTestPos].idx; |
775 | 0 | __m128i xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) & tCoeff[pos] ) ); |
776 | 0 | __m128i xdf = _mm_cmpgt_epi32( xl0, xdfTh ); |
777 | | |
778 | | // same for the next line in the subblock |
779 | 0 | pos += tuPars.m_width; |
780 | 0 | xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) & tCoeff[pos] ) ); |
781 | 0 | xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) ); |
782 | | |
783 | | // and the third line |
784 | 0 | pos += tuPars.m_width; |
785 | 0 | xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) & tCoeff[pos] ) ); |
786 | 0 | xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) ); |
787 | | |
788 | | // and the last line |
789 | 0 | pos += tuPars.m_width; |
790 | 0 | xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) & tCoeff[pos] ) ); |
791 | 0 | xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) ); |
792 | | |
793 | | // if any of the 16 comparisons were true, break, because this subblock contains a coefficient larger than threshold |
794 | 0 | if( !_mm_testz_si128( xdf, xdf ) ) break; |
795 | 0 | } |
796 | |
|
797 | 0 | if( firstTestPos >= 0 ) |
798 | 0 | { |
799 | | // if a coefficient was found, advance the pointer to the end of the current subblock |
800 | | // for the subsequent coefficient-wise refinement (C-impl after endif) |
801 | 0 | firstTestPos += sbbSize - 1; |
802 | 0 | } |
803 | 0 | } |
804 | 0 | } Unexecuted instantiation: void vvenc::DQInternSimd::findFirstPos<(vvenc::x86_simd::X86_VEXT)1>(int&, int const*, vvenc::DQIntern::TUParameters const&, int, bool, int, int) Unexecuted instantiation: void vvenc::DQInternSimd::findFirstPos<(vvenc::x86_simd::X86_VEXT)2>(int&, int const*, vvenc::DQIntern::TUParameters const&, int, bool, int, int) Unexecuted instantiation: void vvenc::DQInternSimd::findFirstPos<(vvenc::x86_simd::X86_VEXT)4>(int&, int const*, vvenc::DQIntern::TUParameters const&, int, bool, int, int) |
805 | | }; |
806 | | |
// Installs the SIMD implementations of the dependent-quantization hot spots
// into the DepQuant function pointers. Instantiated for SSE4.1, SSE4.2 and
// AVX2 (vext = 1, 2, 4); the AVX2 code paths are selected inside the
// templates via USE_AVX2.
template<X86_VEXT vext>
void DepQuant::_initDepQuantX86()
{
  m_checkAllRdCosts     = DQInternSimd::checkAllRdCosts<vext>;
  m_checkAllRdCostsOdd1 = DQInternSimd::checkAllRdCostsOdd1<vext>;
  m_updateStatesEOS     = DQInternSimd::updateStatesEOS<vext>;
  m_updateStates        = DQInternSimd::updateStates<vext>;
  m_findFirstPos        = DQInternSimd::findFirstPos<vext>;
}
816 | | template void DepQuant::_initDepQuantX86<SIMDX86>(); |
817 | | |
818 | | }; // namespace vvenc |
819 | | |
820 | | //! \} |
821 | | |
822 | | #endif //ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 ) |
823 | | |