Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvenc/source/Lib/CommonLib/x86/DepQuantX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
#include "DepQuant.h"
44
45
#if defined(TARGET_SIMD_X86)  && ENABLE_SIMD_OPT_QUANT
46
47
#  include "x86/CommonDefX86.h"
48
#  include <simde/x86/sse4.1.h>
49
#if defined( USE_SSE41 ) || !defined( REAL_TARGET_X86 )
50
#  include <simde/x86/sse4.2.h>
51
#endif
52
53
#include <bitset>
54
55
//! \ingroup CommonLib
56
//! \{
57
namespace vvenc {
58
59
#if USE_SSE41 && defined( REAL_TARGET_X86 )
60
0
#define _my_cmpgt_epi64( a, b ) simde_mm_cmpgt_epi64( a, b )
61
#else
62
0
#define _my_cmpgt_epi64( a, b ) _mm_cmpgt_epi64( a, b )
63
#endif
64
65
  namespace DQInternSimd
66
  {
67
    template<X86_VEXT vext>
68
    static inline void updateStates( const DQIntern::ScanInfo& scanInfo, const DQIntern::Decisions& decisions, DQIntern::StateMem& curr )
69
0
    {
70
0
      int8_t s[4] = { 0 }, t[4] = { 0 }, l[4] = { 0 };
71
72
0
      __m128i v126_4 = _mm_setr_epi16( 126, 126, 126, 126, 4, 4, 4, 4 );
73
0
      __m128i v01 = _mm_setr_epi16( 1, 1, 1, 1, 1, 1, 1, 1 );
74
0
      __m128i v032 = _mm_setr_epi8( 0, 0, 0, 0, 32, 32, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0 );
75
0
      __m128i vn1 = _mm_set1_epi8( -1 );
76
77
0
      static_assert( sizeof( curr.rdCost ) == sizeof( decisions.rdCost ), "Non-matching array size" );
78
0
      memcpy( curr.rdCost, decisions.rdCost, sizeof( decisions.rdCost ) );
79
80
      // in signalling, the coeffs are always max 16 bit!
81
0
      __m128i v = _mm_loadu_si64( decisions.absLevel );
82
0
      v = _mm_unpacklo_epi64( v, v );
83
0
      __m128i p = _mm_loadu_si32( decisions.prevId );
84
0
      _mm_storeu_si32( s, p ); // store previous state indexes
85
0
      p = _mm_shuffle_epi32( p, 0 );
86
0
      __m128i n2 = _mm_cmplt_epi8( p, vn1 );
87
0
      __m128i a_1 = _mm_and_si128( v, v01 );
88
0
      __m128i a_m = _mm_min_epi16( v, _mm_add_epi16( v126_4, a_1 ) );
89
0
      a_m = _mm_packs_epi16( a_m, vn1 );
90
0
      a_m = _mm_or_si128( a_m, _mm_sign_epi8( v032, a_m ) );
91
0
      a_m = _mm_andnot_si128( n2, a_m );
92
0
      _mm_storeu_si32( l, a_m ); // store abs value
93
0
      a_m = _mm_shuffle_epi32( a_m, 1 );
94
0
      _mm_storeu_si32( t, a_m ); // store store capped abs value
95
96
0
      {
97
0
        const int ctxSize = 16 * 4;
98
0
        const int regSize = 16;
99
100
0
        __m128i vshuf = _mm_loadu_si32( s );
101
0
        vshuf = _mm_shuffle_epi32( vshuf, 0 );
102
0
        __m128i vshufmask = _mm_cmplt_epi8( vshuf, _mm_setzero_si128() );
103
0
        vshuf = _mm_add_epi8( vshuf, _mm_setr_epi8( 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 ) );
104
0
        vshuf = _mm_blendv_epi8( vshuf, _mm_set1_epi8( -1 ), vshufmask );
105
106
0
        auto* tplAcc = &curr.tplAcc[0][0];
107
0
        auto* absVal = &curr.absVal[0][0];
108
0
        auto* sum1st = &curr.sum1st[0][0];
109
110
0
        for( int i = 0; i < ctxSize; i += regSize )
111
0
        {
112
0
          __m128i vtpl = _mm_loadu_si128( ( const __m128i* ) &tplAcc[i]);
113
0
          vtpl = _mm_shuffle_epi8( vtpl, vshuf );
114
0
          _mm_storeu_si128( ( __m128i* ) &tplAcc[i], vtpl );
115
116
0
          __m128i vval = _mm_loadu_si128( ( const __m128i* ) &absVal[i] );
117
0
          vval = _mm_shuffle_epi8( vval, vshuf );
118
0
          _mm_storeu_si128( ( __m128i* ) &absVal[i], vval );
119
120
0
          __m128i vsum = _mm_loadu_si128( ( const __m128i* ) &sum1st[i] );
121
0
          vsum = _mm_shuffle_epi8( vsum, vshuf );
122
0
          _mm_storeu_si128( ( __m128i* ) &sum1st[i], vsum );
123
0
        }
124
125
0
        __m128i numSig = _mm_loadu_si32( curr.numSig );
126
0
        numSig = _mm_shuffle_epi8( numSig, vshuf );
127
0
        __m128i lvls = _mm_loadu_si32( l );
128
0
        lvls = _mm_cmpgt_epi8( lvls, _mm_setzero_si128() );
129
0
        numSig = _mm_subs_epi8( numSig, lvls );
130
0
        _mm_storeu_si32( curr.numSig, numSig );
131
132
0
        __m128i rsc = _mm_loadu_si32( curr.refSbbCtxId );
133
0
        rsc = _mm_shuffle_epi8( rsc, vshuf );
134
0
        rsc = _mm_blendv_epi8( rsc, vshuf, vshuf );
135
0
        _mm_storeu_si32( curr.refSbbCtxId, rsc );
136
137
0
        vshuf = _mm_shuffle_epi8( vshuf, _mm_setr_epi8( 0, 0, 1, 1, 2, 2, 3, 3, -1, -1, -1, -1, -1, -1, -1, -1 ) );
138
0
        vshuf = _mm_slli_epi16( vshuf, 1 );
139
0
        vshuf = _mm_add_epi8( vshuf,
140
0
                              _mm_blendv_epi8( _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 ),
141
0
                              _mm_setzero_si128(),
142
0
                              vshuf ) );
143
144
0
        __m128i rrb = _mm_loadu_si64( ( const __m128i* ) curr.remRegBins );
145
0
        rrb = _mm_shuffle_epi8( rrb, vshuf );
146
0
        rrb = _mm_sub_epi16( rrb, v01 );
147
0
        rrb = _mm_blendv_epi8( rrb, _mm_set1_epi16( curr.initRemRegBins ), vshuf );
148
0
        __m128i mlvl = _mm_loadu_si32( l );
149
0
        __m128i mbins = _mm_min_epi8( mlvl, _mm_set1_epi8( 2 ) );
150
0
        __m128i mlutb = _mm_setr_epi8( 0, 1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
151
0
        rrb = _mm_sub_epi16( rrb, _mm_cvtepi8_epi16( _mm_shuffle_epi8( mlutb, mbins ) ) );
152
0
        _mm_storeu_si64( ( __m128i* ) curr.remRegBins, rrb );
153
0
        rrb = _mm_cmplt_epi16( rrb, _mm_set1_epi16( 4 ) );
154
155
0
        curr.anyRemRegBinsLt4 = !!_mm_cvtsi128_si64( rrb );
156
157
0
        __m128i lvl1 = _mm_loadu_si32( l );
158
0
        __m128i tpl1 = _mm_loadu_si32( t );
159
160
0
        auto update_deps_vec = [&]( int k )
161
0
        {
162
0
          int addr = scanInfo.currNbInfoSbb.invInPos[k];
163
164
0
          __m128i msum = _mm_loadu_si32( &curr.sum1st[addr][0] );
165
0
          msum = _mm_adds_epu8( msum, mlvl );
166
0
          _mm_storeu_si32( &curr.sum1st[addr][0], msum);
167
168
0
          __m128i tpl = _mm_loadu_si32( &curr.tplAcc[addr][0] );
169
0
          tpl = _mm_add_epi8( tpl, tpl1 );
170
0
          _mm_storeu_si32( &curr.tplAcc[addr][0], tpl);
171
0
        };
Unexecuted instantiation: DepQuant_sse41.cpp:vvenc::DQInternSimd::updateStates<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DQIntern::ScanInfo const&, vvenc::DQIntern::Decisions const&, vvenc::DQIntern::StateMem&)::{lambda(int)#1}::operator()(int) const
Unexecuted instantiation: DepQuant_sse42.cpp:vvenc::DQInternSimd::updateStates<(vvenc::x86_simd::X86_VEXT)2>(vvenc::DQIntern::ScanInfo const&, vvenc::DQIntern::Decisions const&, vvenc::DQIntern::StateMem&)::{lambda(int)#1}::operator()(int) const
Unexecuted instantiation: DepQuant_avx2.cpp:vvenc::DQInternSimd::updateStates<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DQIntern::ScanInfo const&, vvenc::DQIntern::Decisions const&, vvenc::DQIntern::StateMem&)::{lambda(int)#1}::operator()(int) const
172
173
0
        switch( scanInfo.currNbInfoSbb.numInv )
174
0
        {
175
0
        default:
176
0
        case 5:
177
0
          update_deps_vec( 4 );
178
0
        case 4:
179
0
          update_deps_vec( 3 );
180
0
        case 3:
181
0
          update_deps_vec( 2 );
182
0
        case 2:
183
0
          update_deps_vec( 1 );
184
0
        case 1:
185
0
          update_deps_vec( 0 );
186
0
        case 0:
187
0
          ;
188
0
        }
189
190
0
        _mm_storeu_si32( &curr.absVal[scanInfo.insidePos][0], lvl1);
191
0
      }
192
193
0
      {
194
0
        __m128i tplAcc = _mm_loadu_si32( &curr.tplAcc[scanInfo.nextInsidePos][0]);
195
196
0
        __m128i sumAbs1 = _mm_and_si128( tplAcc, _mm_set1_epi8( 31 ) );
197
0
        __m128i sumNum = _mm_and_si128( _mm_srli_epi32( tplAcc, 5 ), _mm_set1_epi8( 7 ) );
198
0
        __m128i sumGt1 = _mm_sub_epi8( sumAbs1, sumNum );
199
0
        sumGt1 = _mm_min_epi8( sumGt1, _mm_set1_epi8( 4 ) );
200
0
        sumGt1 = _mm_add_epi8( _mm_set1_epi8( scanInfo.gtxCtxOffsetNext ), sumGt1 );
201
0
        _mm_storeu_si32( curr.ctx.cff, sumGt1 );
202
203
0
        sumAbs1 = _mm_add_epi8( sumAbs1, _mm_set1_epi8( 1 ) );
204
0
        sumAbs1 = _mm_srli_epi32( sumAbs1, 1 );
205
0
        sumAbs1 = _mm_and_si128( sumAbs1, _mm_set1_epi8( 127 ) );
206
0
        sumAbs1 = _mm_min_epi8( sumAbs1, _mm_set1_epi8( 3 ) );
207
0
        sumAbs1 = _mm_add_epi8( _mm_set1_epi8( scanInfo.sigCtxOffsetNext ), sumAbs1 );
208
0
        _mm_storeu_si32( curr.ctx.sig, sumAbs1 );
209
210
0
        curr.cffBitsCtxOffset = scanInfo.gtxCtxOffsetNext;
211
0
      }
212
0
    }
Unexecuted instantiation: DepQuant_sse41.cpp:void vvenc::DQInternSimd::updateStates<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DQIntern::ScanInfo const&, vvenc::DQIntern::Decisions const&, vvenc::DQIntern::StateMem&)
Unexecuted instantiation: DepQuant_sse42.cpp:void vvenc::DQInternSimd::updateStates<(vvenc::x86_simd::X86_VEXT)2>(vvenc::DQIntern::ScanInfo const&, vvenc::DQIntern::Decisions const&, vvenc::DQIntern::StateMem&)
Unexecuted instantiation: DepQuant_avx2.cpp:void vvenc::DQInternSimd::updateStates<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DQIntern::ScanInfo const&, vvenc::DQIntern::Decisions const&, vvenc::DQIntern::StateMem&)
213
214
    template<X86_VEXT vext>
215
    static inline void updateStatesEOS( const DQIntern::ScanInfo& scanInfo, const DQIntern::Decisions& decisions, const DQIntern::StateMem& skip, DQIntern::StateMem& curr, DQIntern::CommonCtx& commonCtx )
216
0
    {
217
0
      int8_t s[4] = { 0 }, l[4] = { 0 }, z[4] = { 0 };
218
0
      for( int i = 0; i < 4; ++i )
219
0
      {
220
0
        s[i] = decisions.prevId[i] >= 4 ? -2 : decisions.prevId[i];
221
0
        l[i] = s[i] > -2 ? std::min<int>( decisions.absLevel[i], 126 + ( decisions.absLevel[i] & 1 ) ) : 0;
222
0
        z[i] = 3 - decisions.prevId[i];
223
0
        curr.rdCost[i] = decisions.rdCost[i];
224
0
      }
225
0
      {
226
0
        const int ctxSize = 16 * 4;
227
0
        const int regSize = 16;
228
229
0
        __m128i vshuf = _mm_loadu_si32( s );
230
0
        vshuf = _mm_shuffle_epi32( vshuf, 0 );
231
0
        __m128i vshufmask = _mm_cmplt_epi8( vshuf, _mm_setzero_si128() );
232
0
        vshuf = _mm_add_epi8( vshuf, _mm_setr_epi8( 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 ) );
233
0
        vshuf = _mm_blendv_epi8( vshuf, _mm_set1_epi8( -1 ), vshufmask );
234
235
0
        auto* absVal = &curr.absVal[0][0];
236
237
0
        for( int i = 0; i < ctxSize; i += regSize )
238
0
        {
239
0
          __m128i vval = _mm_loadu_si128( ( const __m128i* ) &absVal[i] );
240
0
          vval = _mm_shuffle_epi8( vval, vshuf );
241
0
          _mm_storeu_si128( ( __m128i* ) &absVal[i], vval );
242
0
        }
243
244
0
        __m128i numSig = _mm_loadu_si32( curr.numSig );
245
0
        numSig = _mm_shuffle_epi8( numSig, vshuf );
246
0
        __m128i lvls = _mm_loadu_si32( l );
247
0
        _mm_storeu_si32( &curr.absVal[scanInfo.insidePos][0], lvls);
248
0
        lvls = _mm_cmpgt_epi8( lvls, _mm_setzero_si128() );
249
0
        numSig = _mm_subs_epi8( numSig, lvls );
250
0
        _mm_storeu_si32( curr.numSig, numSig );
251
252
0
        __m128i rsc = _mm_loadu_si32( curr.refSbbCtxId );
253
0
        rsc = _mm_shuffle_epi8( rsc, vshuf );
254
0
        rsc = _mm_blendv_epi8( rsc, vshuf, vshuf );
255
0
        _mm_storeu_si32( curr.refSbbCtxId, rsc );
256
257
0
        vshuf = _mm_shuffle_epi8( vshuf, _mm_setr_epi8( 0, 0, 1, 1, 2, 2, 3, 3, -1, -1, -1, -1, -1, -1, -1, -1 ) );
258
0
        vshuf = _mm_slli_epi16( vshuf, 1 );
259
0
        vshuf = _mm_add_epi8( vshuf,
260
0
                              _mm_blendv_epi8( _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 ),
261
0
                              _mm_setzero_si128(),
262
0
                              vshuf ) );
263
264
0
        __m128i rrb = _mm_loadu_si64( ( const __m128i* ) curr.remRegBins );
265
0
        rrb = _mm_shuffle_epi8( rrb, vshuf );
266
0
        rrb = _mm_sub_epi16( rrb, _mm_set1_epi16( 1 ) );
267
0
        rrb = _mm_blendv_epi8( rrb, _mm_set1_epi16( curr.initRemRegBins ), vshuf );
268
269
0
        __m128i vskip = _mm_cvtepi8_epi16( _mm_loadu_si32( z ) );
270
0
        rrb = _mm_blendv_epi8( rrb, _mm_loadu_si64( ( const __m128i* ) skip.remRegBins ), vskip );
271
272
0
        __m128i mlvl = _mm_loadu_si32( l );
273
0
        __m128i mbins = _mm_min_epi8( mlvl, _mm_set1_epi8( 2 ) );
274
0
        __m128i mlutb = _mm_setr_epi8( 0, 1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
275
0
        rrb = _mm_sub_epi16( rrb, _mm_cvtepi8_epi16( _mm_shuffle_epi8( mlutb, mbins ) ) );
276
0
        _mm_storeu_si64( ( __m128i* ) curr.remRegBins, rrb );
277
0
        rrb = _mm_cmplt_epi16( rrb, _mm_set1_epi16( 4 ) );
278
279
0
        curr.anyRemRegBinsLt4 = !!_mm_cvtsi128_si64( rrb );
280
0
      }
281
282
0
      {
283
0
        uint8_t* levels0;
284
0
        uint8_t* levels1;
285
0
        uint8_t* levels2;
286
0
        uint8_t* levels3;
287
288
0
        commonCtx.getLevelPtrs( scanInfo, levels0, levels1, levels2, levels3 );
289
290
0
        const int regSize = 16;
291
0
        const int ctxSize = scanInfo.sbbSize << 2;
292
293
0
        const __m128i vshuf0 = _mm_setr_epi8( 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
294
0
        const __m128i vshuf1 = _mm_setr_epi8( 1, 5, 9, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
295
0
        const __m128i vshuf2 = _mm_setr_epi8( 2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
296
0
        const __m128i vshuf3 = _mm_setr_epi8( 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
297
298
0
        auto* absVal = &curr.absVal[0][0];
299
300
0
        for( int i = 0, j = 0; i < ctxSize; i += regSize, j += 4 )
301
0
        {
302
0
          __m128i in = _mm_loadu_si128( ( const __m128i* ) &absVal[i] );
303
304
0
          _mm_storeu_si32( &levels0[j], _mm_shuffle_epi8( in, vshuf0 ) );
305
0
          _mm_storeu_si32( &levels1[j], _mm_shuffle_epi8( in, vshuf1 ) );
306
0
          _mm_storeu_si32( &levels2[j], _mm_shuffle_epi8( in, vshuf2 ) );
307
0
          _mm_storeu_si32( &levels3[j], _mm_shuffle_epi8( in, vshuf3 ) );
308
0
        }
309
0
      }
310
311
0
      memset( curr.absVal, 0, sizeof( curr.absVal ) );
312
0
      memset( curr.tplAcc, 0, sizeof( curr.tplAcc ) );
313
0
      memset( curr.sum1st, 0, sizeof( curr.sum1st ) );
314
315
0
      for( int i = 0; i < 4; i++ )
316
0
      {
317
0
        int prevId = decisions.prevId[i];
318
319
0
        if( prevId > -2 )
320
0
        {
321
0
          const int refId = prevId < 0 ? -1 : ( prevId < 4 ? curr.refSbbCtxId[i] : prevId - 4 );
322
0
          commonCtx.update( scanInfo, refId, i, curr );
323
0
        }
324
0
      }
325
326
0
      memset( curr.numSig, 0, sizeof( curr.numSig ) );
327
328
0
      {
329
0
        __m128i tplAcc = _mm_loadu_si32( &curr.tplAcc[scanInfo.nextInsidePos][0]);
330
331
0
        __m128i sumAbs1 = _mm_and_si128( tplAcc, _mm_set1_epi8( 31 ) );
332
0
        __m128i sumNum = _mm_and_si128( _mm_srli_epi32( tplAcc, 5 ), _mm_set1_epi8( 7 ) );
333
0
        __m128i sumGt1 = _mm_sub_epi8( sumAbs1, sumNum );
334
0
        sumGt1 = _mm_min_epi8( sumGt1, _mm_set1_epi8( 4 ) );
335
0
        sumGt1 = _mm_add_epi8( _mm_set1_epi8( scanInfo.gtxCtxOffsetNext ), sumGt1 );
336
0
        _mm_storeu_si32( curr.ctx.cff, sumGt1 );
337
338
0
        sumAbs1 = _mm_add_epi8( sumAbs1, _mm_set1_epi8( 1 ) );
339
0
        sumAbs1 = _mm_srli_epi32( sumAbs1, 1 );
340
0
        sumAbs1 = _mm_and_si128( sumAbs1, _mm_set1_epi8( 127 ) );
341
0
        sumAbs1 = _mm_min_epi8( sumAbs1, _mm_set1_epi8( 3 ) );
342
0
        sumAbs1 = _mm_add_epi8( _mm_set1_epi8( scanInfo.sigCtxOffsetNext ), sumAbs1 );
343
0
        _mm_storeu_si32( curr.ctx.sig, sumAbs1 );
344
345
0
        curr.cffBitsCtxOffset = scanInfo.gtxCtxOffsetNext;
346
0
      }
347
0
    }
Unexecuted instantiation: DepQuant_sse41.cpp:void vvenc::DQInternSimd::updateStatesEOS<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DQIntern::ScanInfo const&, vvenc::DQIntern::Decisions const&, vvenc::DQIntern::StateMem const&, vvenc::DQIntern::StateMem&, vvenc::DQIntern::CommonCtx&)
Unexecuted instantiation: DepQuant_sse42.cpp:void vvenc::DQInternSimd::updateStatesEOS<(vvenc::x86_simd::X86_VEXT)2>(vvenc::DQIntern::ScanInfo const&, vvenc::DQIntern::Decisions const&, vvenc::DQIntern::StateMem const&, vvenc::DQIntern::StateMem&, vvenc::DQIntern::CommonCtx&)
Unexecuted instantiation: DepQuant_avx2.cpp:void vvenc::DQInternSimd::updateStatesEOS<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DQIntern::ScanInfo const&, vvenc::DQIntern::Decisions const&, vvenc::DQIntern::StateMem const&, vvenc::DQIntern::StateMem&, vvenc::DQIntern::CommonCtx&)
348
349
    // has to be called as a first check, assumes no decision has been made yet
350
    template<X86_VEXT vext>
351
    static void checkAllRdCosts( const DQIntern::ScanPosType spt, const DQIntern::PQData* pqData, DQIntern::Decisions& decisions, const DQIntern::StateMem& state )
352
0
    {
353
      // State mapping
354
      // decision 0: either A from 0 (pq0), or B from 1 (pq2), or 0 from 0
355
      // decision 1: either A from 2 (pq3), or B from 3 (pq1), or 0 from 2
356
      // decision 2: either A from 1 (pq0), or B from 0 (pq2), or 0 from 1
357
      // decision 3: either A from 3 (pq3), or B from 2 (pq1), or 0 from 3
358
359
0
      __m128i mrd01 = _mm_loadu_si128( ( const __m128i* ) & state.rdCost[0] );
360
0
      __m128i mrd23 = _mm_loadu_si128( ( const __m128i* ) & state.rdCost[2] );
361
362
      //int64_t         rdCostA   = state.rdCost[m_stateId] + pqDataA.deltaDist;
363
      //int64_t         rdCostB   = state.rdCost[m_stateId] + pqDataB.deltaDist;
364
      //int64_t         rdCostZ   = state.rdCost[m_stateId];
365
0
      __m128i rdCostZ01 = _mm_unpacklo_epi64( mrd01, mrd23 );
366
0
      __m128i rdCostZ23 = _mm_unpackhi_epi64( mrd01, mrd23 );
367
0
      __m128i deltaDist = _mm_unpacklo_epi64( _mm_loadu_si64( &pqData[2].deltaDist ), _mm_loadu_si64( &pqData[1].deltaDist ) );
368
0
      __m128i rdCostB01 = _mm_add_epi64( rdCostZ23, deltaDist );
369
0
      __m128i rdCostB23 = _mm_add_epi64( rdCostZ01, deltaDist );
370
0
      deltaDist = _mm_unpacklo_epi64( _mm_loadu_si64( &pqData[0].deltaDist ), _mm_loadu_si64( &pqData[3].deltaDist ) );
371
0
      __m128i rdCostA01 = _mm_add_epi64( rdCostZ01, deltaDist );
372
0
      __m128i rdCostA23 = _mm_add_epi64( rdCostZ23, deltaDist );
373
374
      //const CoeffFracBits &cffBits = m_gtxFracBitsArray[state.ctx.cff[m_stateId]];
375
      //const BinFracBits    sigBits = m_sigFracBitsArray[state.ctx.sig[m_stateId]];
376
      //
377
      //rdCostA += cffBits.bits[ pqDataA.absLevel ];
378
      //rdCostB += cffBits.bits[ pqDataB.absLevel ];
379
0
      __m128i sgbts02 = _mm_unpacklo_epi64( _mm_loadu_si64( &state.m_sigFracBitsArray[0][state.ctx.sig[0]] ),
380
0
                                            _mm_loadu_si64( &state.m_sigFracBitsArray[2][state.ctx.sig[2]] ) );
381
0
      __m128i sgbts13 = _mm_unpacklo_epi64( _mm_loadu_si64( &state.m_sigFracBitsArray[1][state.ctx.sig[1]] ),
382
0
                                            _mm_loadu_si64( &state.m_sigFracBitsArray[3][state.ctx.sig[3]] ) );
383
384
0
      {
385
0
        __m128i sgbts02_0 = _mm_shuffle_epi32( sgbts02, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
386
0
        __m128i sgbts02_1 = _mm_shuffle_epi32( sgbts02, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
387
0
        __m128i sgbts13_0 = _mm_shuffle_epi32( sgbts13, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
388
0
        __m128i sgbts13_1 = _mm_shuffle_epi32( sgbts13, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
389
390
0
        sgbts02 = _mm_unpacklo_epi64( sgbts02_0, sgbts02_1 );
391
0
        sgbts13 = _mm_unpacklo_epi64( sgbts13_0, sgbts13_1 );
392
0
      }
393
394
0
      {
395
        // coeff context is indepndent of state
396
0
        auto& base = state.m_gtxFracBitsArray;
397
398
0
        int32_t cffBitsArr[4] =
399
0
        {
400
0
          base[state.ctx.cff[1]].bits[pqData[2].absLevel],
401
0
          base[state.ctx.cff[3]].bits[pqData[1].absLevel],
402
0
          base[state.ctx.cff[0]].bits[pqData[2].absLevel],
403
0
          base[state.ctx.cff[2]].bits[pqData[1].absLevel],
404
0
        };
405
406
0
        __m128i cffBits = _mm_loadu_si128( ( const __m128i* ) cffBitsArr );
407
0
        __m128i add = _mm_cvtepi32_epi64( cffBits );
408
0
        rdCostB01 = _mm_add_epi64( rdCostB01, add );
409
0
        add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) );
410
0
        rdCostB23 = _mm_add_epi64( rdCostB23, add );
411
0
      }
412
413
0
      {
414
        // coeff context is indepndent of state
415
0
        auto& base = state.m_gtxFracBitsArray;
416
417
0
        int32_t cffBitsArr[4] =
418
0
        {
419
0
          base[state.ctx.cff[0]].bits[pqData[0].absLevel],
420
0
          base[state.ctx.cff[2]].bits[pqData[3].absLevel],
421
0
          base[state.ctx.cff[1]].bits[pqData[0].absLevel],
422
0
          base[state.ctx.cff[3]].bits[pqData[3].absLevel],
423
0
        };
424
425
0
        __m128i cffBits = _mm_loadu_si128( ( const __m128i* ) cffBitsArr );
426
0
        __m128i add = _mm_cvtepi32_epi64( cffBits );
427
0
        rdCostA01 = _mm_add_epi64( rdCostA01, add );
428
0
        add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) );
429
0
        rdCostA23 = _mm_add_epi64( rdCostA23, add );
430
0
      }
431
432
0
      if( spt == DQIntern::SCAN_ISCSBB )
433
0
      {
434
        //  rdCostZ += sigBits.intBits[ 0 ];
435
0
        rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
436
0
        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
437
438
0
        sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
439
0
        sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );
440
441
        //  rdCostB += sigBits.intBits[ 1 ];
442
0
        rdCostB01 = _mm_add_epi64( rdCostB01, _mm_cvtepi32_epi64( sgbts13 ) );
443
0
        rdCostB23 = _mm_add_epi64( rdCostB23, _mm_cvtepi32_epi64( sgbts02 ) );
444
445
        //  rdCostA += sigBits.intBits[ 1 ];
446
0
        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts02 ) );
447
0
        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts13 ) );
448
0
      }
449
0
      else if( spt == DQIntern::SCAN_SOCSBB )
450
0
      {
451
        //  rdCostA += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 1 ];
452
        //  rdCostB += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 1 ];
453
        //  rdCostZ += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 0 ];
454
0
        __m128i sbbBits = _mm_loadu_si128( ( const __m128i* ) state.sbbBits1 );
455
0
        sbbBits = _mm_shuffle_epi32( sbbBits, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
456
457
0
        rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
458
0
        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
459
460
0
        __m128i add = _mm_cvtepi32_epi64( sbbBits );
461
0
        rdCostB23 = _mm_add_epi64( rdCostB23, add );
462
0
        rdCostA01 = _mm_add_epi64( rdCostA01, add );
463
0
        rdCostZ01 = _mm_add_epi64( rdCostZ01, add );
464
0
        add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( sbbBits, sbbBits ) );
465
0
        rdCostB01 = _mm_add_epi64( rdCostB01, add );
466
0
        rdCostA23 = _mm_add_epi64( rdCostA23, add );
467
0
        rdCostZ23 = _mm_add_epi64( rdCostZ23, add );
468
469
0
        sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
470
0
        sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );
471
0
        rdCostB01 = _mm_add_epi64( rdCostB01, _mm_cvtepi32_epi64( sgbts13 ) );
472
0
        rdCostB23 = _mm_add_epi64( rdCostB23, _mm_cvtepi32_epi64( sgbts02 ) );
473
474
0
        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts02 ) );
475
0
        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts13 ) );
476
0
      }
477
0
      else
478
0
      {
479
        //else if( state.numSig[m_stateId] )
480
        //{
481
        //  rdCostA += sigBits.intBits[ 1 ];
482
        //  rdCostB += sigBits.intBits[ 1 ];
483
        //  rdCostZ += sigBits.intBits[ 0 ];
484
        //}
485
        //else
486
        //{
487
        //  rdCostZ = decisionA.rdCost;
488
        //}
489
0
        __m128i numSig = _mm_loadu_si32( state.numSig );
490
491
0
        rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
492
0
        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
493
494
0
        __m128i mask13 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3 ) );
495
0
        mask13 = _mm_cmpgt_epi8( mask13, _mm_setzero_si128() );
496
0
        __m128i mask02 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2 ) );
497
0
        mask02 = _mm_cmpgt_epi8( mask02, _mm_setzero_si128() );
498
499
0
        sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
500
0
        sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );
501
502
0
        rdCostB01 = _mm_add_epi64( rdCostB01, _mm_and_si128( mask13, _mm_cvtepi32_epi64( sgbts13 ) ) );
503
0
        rdCostB23 = _mm_add_epi64( rdCostB23, _mm_and_si128( mask02, _mm_cvtepi32_epi64( sgbts02 ) ) );
504
505
0
        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_and_si128( mask02, _mm_cvtepi32_epi64( sgbts02 ) ) );
506
0
        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_and_si128( mask13, _mm_cvtepi32_epi64( sgbts13 ) ) );
507
508
0
        __m128i rdMax = _mm_loadu_si64( &DQIntern::rdCostInit );
509
0
        rdMax = _mm_unpacklo_epi64( rdMax, rdMax );
510
511
0
        rdCostZ01 = _mm_blendv_epi8( rdMax, rdCostZ01, mask02 );
512
0
        rdCostZ23 = _mm_blendv_epi8( rdMax, rdCostZ23, mask13 );
513
0
      }
514
      // decision 0: either A from 0 (pq0), or B from 1 (pq2), or 0 from 0
515
      // decision 1: either A from 2 (pq3), or B from 3 (pq1), or 0 from 2
516
      // decision 2: either A from 1 (pq0), or B from 0 (pq2), or 0 from 1
517
      // decision 3: either A from 3 (pq3), or B from 2 (pq1), or 0 from 3
518
      // Z0, or A0, or B0
519
      // Z1, or A1, or B1
520
      // B2, or Z2, or A2
521
      // B3, or Z3, or A3
522
523
0
      __m128i rdBest01 = rdCostZ01;
524
0
      __m128i rdBest23 = rdCostB23;
525
526
0
      __m128i valBest = _mm_setr_epi32( 0, 0, pqData[2].absLevel, pqData[1].absLevel );
527
528
#if ENABLE_VALGRIND_CODE
529
      // just to avoid strange "unknown instruction"  error
530
      __m128i valCand = _mm_setr_epi32( 0, pqData[3].absLevel, 0, 0 );
531
      valCand = _mm_insert_epi32( valCand, pqData[0].absLevel, 0 );
532
#else
533
0
      __m128i valCand = _mm_setr_epi32( pqData[0].absLevel, pqData[3].absLevel, 0, 0 );
534
0
#endif
535
0
      __m128i idxBest = _mm_setr_epi32( 0, 2, 0, 2 );
536
0
      __m128i idxCand = _mm_setr_epi32( 0, 2, 1, 3 );
537
538
0
      __m128i chng01 = _my_cmpgt_epi64( rdBest01, rdCostA01 );
539
0
      __m128i chng23 = _my_cmpgt_epi64( rdBest23, rdCostZ23 );
540
0
      __m128i chng = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // 00110011
541
0
      chng = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
542
543
0
      rdBest01 = _mm_blendv_epi8( rdBest01, rdCostA01, chng01 );
544
0
      rdBest23 = _mm_blendv_epi8( rdBest23, rdCostZ23, chng23 );
545
546
0
      valBest = _mm_blendv_epi8( valBest, valCand, chng );
547
0
      idxBest = _mm_blendv_epi8( idxBest, idxCand, chng );
548
549
550
0
      valCand = _mm_setr_epi32( pqData[2].absLevel, pqData[1].absLevel, pqData[0].absLevel, pqData[3].absLevel );
551
0
      idxCand = _mm_setr_epi32( 1, 3, 1, 3 );
552
553
0
      chng01 = _my_cmpgt_epi64( rdBest01, rdCostB01 );
554
0
      chng23 = _my_cmpgt_epi64( rdBest23, rdCostA23 );
555
0
      chng = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // 00110011
556
0
      chng = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
557
558
0
      rdBest01 = _mm_blendv_epi8( rdBest01, rdCostB01, chng01 );
559
0
      rdBest23 = _mm_blendv_epi8( rdBest23, rdCostA23, chng23 );
560
561
0
      valBest = _mm_blendv_epi8( valBest, valCand, chng );
562
0
      idxBest = _mm_blendv_epi8( idxBest, idxCand, chng );
563
564
565
0
      valBest = _mm_packs_epi32( valBest, _mm_setzero_si128() );
566
0
      idxBest = _mm_packs_epi32( idxBest, _mm_setzero_si128() );
567
0
      idxBest = _mm_packs_epi16( idxBest, _mm_setzero_si128() );
568
569
570
0
      _mm_storeu_si128( ( __m128i* ) & decisions.rdCost[0], rdBest01 );
571
0
      _mm_storeu_si128( ( __m128i* ) & decisions.rdCost[2], rdBest23 );
572
573
0
      _mm_storeu_si64( decisions.absLevel, valBest );
574
0
      _mm_storeu_si32( decisions.prevId, idxBest );
575
0
    }
Unexecuted instantiation: DepQuant_sse41.cpp:void vvenc::DQInternSimd::checkAllRdCosts<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DQIntern::ScanPosType, vvenc::DQIntern::PQData const*, vvenc::DQIntern::Decisions&, vvenc::DQIntern::StateMem const&)
Unexecuted instantiation: DepQuant_sse42.cpp:void vvenc::DQInternSimd::checkAllRdCosts<(vvenc::x86_simd::X86_VEXT)2>(vvenc::DQIntern::ScanPosType, vvenc::DQIntern::PQData const*, vvenc::DQIntern::Decisions&, vvenc::DQIntern::StateMem const&)
Unexecuted instantiation: DepQuant_avx2.cpp:void vvenc::DQInternSimd::checkAllRdCosts<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DQIntern::ScanPosType, vvenc::DQIntern::PQData const*, vvenc::DQIntern::Decisions&, vvenc::DQIntern::StateMem const&)
576
577
    // has to be called as a first check, assumes no decision has been made yet!!!
578
    template<X86_VEXT vext>
579
    static inline void checkAllRdCostsOdd1( const DQIntern::ScanPosType spt, const int64_t pq_a_dist, const int64_t pq_b_dist, DQIntern::Decisions& decisions, const DQIntern::StateMem& state )
580
0
    {
581
      // State mapping
582
      // decision 0: either 1 from 1 (pqData[2]), or 0 from 0
583
      // decision 1: either 1 from 3 (pqData[1]), or 0 from 2
584
      // decision 2: either 1 from 0 (pqData[2]), or 0 from 1
585
      // decision 3: either 1 from 2 (pqData[1]), or 0 from 3
586
587
0
      __m128i mrd01 = _mm_loadu_si128( ( const __m128i* ) & state.rdCost[0] );
588
0
      __m128i mrd23 = _mm_loadu_si128( ( const __m128i* ) & state.rdCost[2] );
589
590
      //int64_t         rdCostA   = state.rdCost[m_stateId] + pqDataA.deltaDist; // done
591
      //int64_t         rdCostZ   = state.rdCost[m_stateId]; // done
592
0
      __m128i rdCostZ01 = _mm_unpacklo_epi64( mrd01, mrd23 );
593
0
      __m128i rdCostZ23 = _mm_unpackhi_epi64( mrd01, mrd23 );
594
0
      __m128i deltaDist = _mm_unpacklo_epi64( _mm_cvtsi64_si128( pq_b_dist ), _mm_cvtsi64_si128( pq_a_dist ) );
595
0
      __m128i rdCostA01 = _mm_add_epi64( rdCostZ23, deltaDist );
596
0
      __m128i rdCostA23 = _mm_add_epi64( rdCostZ01, deltaDist );
597
598
      //const BinFracBits sigBits = m_sigFracBitsArray[state.ctx.sig[m_stateId]];
599
      //
600
      //rdCostA += m_gtxFracBitsArray[state.ctx.cff[m_stateId]].bits[1]; // done
601
      //
602
0
      __m128i sgbts02 = _mm_unpacklo_epi64( _mm_loadu_si64( &state.m_sigFracBitsArray[0][state.ctx.sig[0]] ),
603
0
                                            _mm_loadu_si64( &state.m_sigFracBitsArray[2][state.ctx.sig[2]] ) );
604
0
      __m128i sgbts13 = _mm_unpacklo_epi64( _mm_loadu_si64( &state.m_sigFracBitsArray[1][state.ctx.sig[1]] ),
605
0
                                            _mm_loadu_si64( &state.m_sigFracBitsArray[3][state.ctx.sig[3]] ) );
606
607
0
      {
608
0
        __m128i sgbts02_0 = _mm_shuffle_epi32( sgbts02, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
609
0
        __m128i sgbts02_1 = _mm_shuffle_epi32( sgbts02, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
610
0
        __m128i sgbts13_0 = _mm_shuffle_epi32( sgbts13, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
611
0
        __m128i sgbts13_1 = _mm_shuffle_epi32( sgbts13, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
612
613
0
        sgbts02 = _mm_unpacklo_epi64( sgbts02_0, sgbts02_1 );
614
0
        sgbts13 = _mm_unpacklo_epi64( sgbts13_0, sgbts13_1 );
615
0
      }
616
617
0
      {
618
#if USE_AVX2
619
        __m128i cffidx = _mm_cvtepi8_epi32( _mm_loadu_si32( &state.ctx.cff ) );
620
        cffidx = _mm_shuffle_epi32( cffidx, ( 1 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
621
        cffidx = _mm_sub_epi8( cffidx, _mm_set1_epi32( state.cffBitsCtxOffset ) );
622
        __m256i cffBits256 = _mm256_loadu_si256( ( const __m256i* ) & state.cffBits1[state.cffBitsCtxOffset] );
623
        cffBits256 = _mm256_permutevar8x32_epi32( cffBits256, _mm256_castsi128_si256( cffidx ) );
624
        __m128i cffBits = _mm256_castsi256_si128( cffBits256 );
625
#else
626
        __m128i cffBits;
627
        __m128i bits0123 = _mm_loadu_si128( ( const __m128i* ) & state.cffBits1[state.cffBitsCtxOffset + 0] );
628
        __m128i bits4 = _mm_loadu_si32( &state.cffBits1[state.cffBitsCtxOffset + 4] );
629
        __m128i cfCtxIdx = _mm_loadu_si32( &state.ctx.cff );
630
        cfCtxIdx = _mm_cvtepi8_epi32( cfCtxIdx );
631
        cfCtxIdx = _mm_sub_epi8( cfCtxIdx, _mm_set1_epi32( state.cffBitsCtxOffset ) );
632
        cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_slli_si128( cfCtxIdx, 1 ) );
633
        cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_slli_si128( cfCtxIdx, 2 ) );
634
        cfCtxIdx = _mm_slli_epi32( cfCtxIdx, 2 );
635
        cfCtxIdx = _mm_add_epi8( cfCtxIdx, _mm_setr_epi8( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 ) );
636
        cffBits = _mm_shuffle_epi8( bits4, _mm_sub_epi8( cfCtxIdx, _mm_set1_epi8( 16 ) ) );
637
        cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_cmpgt_epi8( cfCtxIdx, _mm_set1_epi8( 15 ) ) );
638
        cffBits = _mm_or_si128( cffBits, _mm_shuffle_epi8( bits0123, cfCtxIdx ) );
639
        cffBits = _mm_shuffle_epi32( cffBits, ( 1 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
640
#endif
641
0
        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( cffBits ) );
642
0
        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) ) );
643
0
      }
644
645
0
      if( spt == DQIntern::SCAN_ISCSBB )
646
0
      {
647
        //  rdCostZ += sigBits.intBits[ 0 ]; // done
648
0
        rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
649
0
        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
650
651
0
        sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
652
0
        sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );
653
654
        //  rdCostA += sigBits.intBits[ 1 ]; // done
655
0
        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts13 ) );
656
0
        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts02 ) );
657
0
      }
658
0
      else if( spt == DQIntern::SCAN_SOCSBB )
659
0
      {
660
        //  rdCostZ += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 0 ]; // done
661
        //  rdCostA += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 1 ]; // done
662
0
        __m128i sbbBits = _mm_loadu_si128( ( const __m128i* ) state.sbbBits1 );
663
0
        sbbBits = _mm_shuffle_epi32( sbbBits, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
664
665
0
        rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
666
0
        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
667
668
0
        __m128i add = _mm_cvtepi32_epi64( sbbBits );
669
0
        rdCostA23 = _mm_add_epi64( rdCostA23, add );
670
0
        rdCostZ01 = _mm_add_epi64( rdCostZ01, add );
671
0
        add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( sbbBits, sbbBits ) );
672
0
        rdCostA01 = _mm_add_epi64( rdCostA01, add );
673
0
        rdCostZ23 = _mm_add_epi64( rdCostZ23, add );
674
675
0
        sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
676
0
        sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );
677
0
        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts13 ) );
678
0
        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts02 ) );
679
0
      }
680
0
      else
681
0
      {
682
        //else if( m_numSigSbb )
683
        //{
684
        //  rdCostA += sigBits.intBits[ 1 ]; // done
685
        //  rdCostZ += sigBits.intBits[ 0 ]; // done
686
        //}
687
        //else
688
        //{
689
        //  rdCostZ = decisionZ.rdCost; // done
690
        //}
691
692
0
        __m128i numSig = _mm_loadu_si32( state.numSig );
693
694
0
        rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
695
0
        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
696
697
0
        __m128i mask01 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3 ) );
698
0
        mask01 = _mm_cmpgt_epi8( mask01, _mm_setzero_si128() );
699
0
        __m128i mask23 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2 ) );
700
0
        mask23 = _mm_cmpgt_epi8( mask23, _mm_setzero_si128() );
701
0
        sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 );
702
0
        sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 );
703
0
        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_and_si128( mask01, _mm_cvtepi32_epi64( sgbts13 ) ) );
704
0
        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_and_si128( mask23, _mm_cvtepi32_epi64( sgbts02 ) ) );
705
706
0
        __m128i rdMax = _mm_loadu_si64( &DQIntern::rdCostInit );
707
0
        rdMax = _mm_unpacklo_epi64( rdMax, rdMax );
708
709
0
        rdCostZ01 = _mm_blendv_epi8( rdMax, rdCostZ01, mask23 );
710
0
        rdCostZ23 = _mm_blendv_epi8( rdMax, rdCostZ23, mask01 );
711
0
      }
712
713
      //// decision 0: either 1 from 1 (pqData[2]), or 0 from 0
714
      //// decision 1: either 1 from 3 (pqData[1]), or 0 from 2
715
      //// decision 2: either 1 from 0 (pqData[2]), or 0 from 1
716
      //// decision 3: either 1 from 2 (pqData[1]), or 0 from 3
717
718
      // d0: Z0, or A0
719
      // d1: Z1, or A1
720
      // d2: A2, or Z2
721
      // d3: A3, or Z3
722
723
0
      __m128i rdBest01 = rdCostZ01;
724
0
      __m128i rdBest23 = rdCostA23;
725
726
0
      __m128i valBest = _mm_setr_epi32( 0, 0, 1, 1 );
727
0
      __m128i valCand = _mm_setr_epi32( 1, 1, 0, 0 );
728
729
0
      __m128i idxBest = _mm_setr_epi32( 0, 2, 0, 2 );
730
0
      __m128i idxCand = _mm_setr_epi32( 1, 3, 1, 3 );
731
732
0
      __m128i chng01 = _my_cmpgt_epi64( rdBest01, rdCostA01 );
733
0
      __m128i chng23 = _my_cmpgt_epi64( rdBest23, rdCostZ23 );
734
0
      __m128i chng = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // 00110011
735
0
      chng = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
736
737
0
      rdBest01 = _mm_blendv_epi8( rdBest01, rdCostA01, chng01 );
738
0
      rdBest23 = _mm_blendv_epi8( rdBest23, rdCostZ23, chng23 );
739
740
0
      _mm_storeu_si128( ( __m128i* ) & decisions.rdCost[0], rdBest01 );
741
0
      _mm_storeu_si128( ( __m128i* ) & decisions.rdCost[2], rdBest23 );
742
743
0
      valBest = _mm_packs_epi32( _mm_blendv_epi8( valBest, valCand, chng ), _mm_setzero_si128() );
744
0
      idxBest = _mm_packs_epi32( _mm_blendv_epi8( idxBest, idxCand, chng ), _mm_setzero_si128() );
745
0
      idxBest = _mm_packs_epi16( idxBest, _mm_setzero_si128() );
746
747
0
      _mm_storeu_si64( decisions.absLevel, valBest );
748
0
      _mm_storeu_si32( decisions.prevId, idxBest );
749
0
    }
Unexecuted instantiation: DepQuant_sse41.cpp:void vvenc::DQInternSimd::checkAllRdCostsOdd1<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DQIntern::ScanPosType, long, long, vvenc::DQIntern::Decisions&, vvenc::DQIntern::StateMem const&)
Unexecuted instantiation: DepQuant_sse42.cpp:void vvenc::DQInternSimd::checkAllRdCostsOdd1<(vvenc::x86_simd::X86_VEXT)2>(vvenc::DQIntern::ScanPosType, long, long, vvenc::DQIntern::Decisions&, vvenc::DQIntern::StateMem const&)
Unexecuted instantiation: DepQuant_avx2.cpp:void vvenc::DQInternSimd::checkAllRdCostsOdd1<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DQIntern::ScanPosType, long, long, vvenc::DQIntern::Decisions&, vvenc::DQIntern::StateMem const&)
750
751
    template<X86_VEXT vext>
752
    void findFirstPos( int& firstTestPos, const TCoeff* tCoeff, const DQIntern::TUParameters& tuPars, int defaultTh, bool zeroOutForThres, int zeroOutWidth, int zeroOutHeight )
753
0
    {
754
0
      if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 )
755
0
      {
756
0
        const int sbbSize = tuPars.m_sbbSize;
757
        // move the pointer to the beginning of the current subblock
758
0
        firstTestPos -= ( sbbSize - 1 );
759
760
0
        const __m128i xdfTh = _mm_set1_epi32( defaultTh );
761
762
        // for each subblock
763
0
        for( ; firstTestPos >= 0; firstTestPos -= sbbSize )
764
0
        {
765
          // skip zeroed out blocks
766
          // for 64-point transformation the coding order takes care of that
767
0
          if( zeroOutForThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) )
768
0
          {
769
0
            continue;
770
0
          }
771
772
          // read first line of the subblock and check for coefficients larger than the threshold
773
          // assumming the subblocks are dense 4x4 blocks in raster scan order with the stride of tuPars.m_width
774
0
          int pos = tuPars.m_scanId2BlkPos[firstTestPos].idx;
775
0
          __m128i xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) & tCoeff[pos] ) );
776
0
          __m128i xdf = _mm_cmpgt_epi32( xl0, xdfTh );
777
778
          // same for the next line in the subblock
779
0
          pos += tuPars.m_width;
780
0
          xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) & tCoeff[pos] ) );
781
0
          xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
782
783
          // and the third line
784
0
          pos += tuPars.m_width;
785
0
          xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) & tCoeff[pos] ) );
786
0
          xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
787
788
          // and the last line
789
0
          pos += tuPars.m_width;
790
0
          xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) & tCoeff[pos] ) );
791
0
          xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
792
793
          // if any of the 16 comparisons were true, break, because this subblock contains a coefficient larger than threshold
794
0
          if( !_mm_testz_si128( xdf, xdf ) ) break;
795
0
        }
796
797
0
        if( firstTestPos >= 0 )
798
0
        {
799
          // if a coefficient was found, advance the pointer to the end of the current subblock
800
          // for the subsequent coefficient-wise refinement (C-impl after endif)
801
0
          firstTestPos += sbbSize - 1;
802
0
        }
803
0
      }
804
0
    }
Unexecuted instantiation: void vvenc::DQInternSimd::findFirstPos<(vvenc::x86_simd::X86_VEXT)1>(int&, int const*, vvenc::DQIntern::TUParameters const&, int, bool, int, int)
Unexecuted instantiation: void vvenc::DQInternSimd::findFirstPos<(vvenc::x86_simd::X86_VEXT)2>(int&, int const*, vvenc::DQIntern::TUParameters const&, int, bool, int, int)
Unexecuted instantiation: void vvenc::DQInternSimd::findFirstPos<(vvenc::x86_simd::X86_VEXT)4>(int&, int const*, vvenc::DQIntern::TUParameters const&, int, bool, int, int)
805
  };
806
807
template<X86_VEXT vext>
808
void DepQuant::_initDepQuantX86()
809
0
{
810
0
  m_checkAllRdCosts     = DQInternSimd::checkAllRdCosts<vext>;
811
0
  m_checkAllRdCostsOdd1 = DQInternSimd::checkAllRdCostsOdd1<vext>;
812
0
  m_updateStatesEOS     = DQInternSimd::updateStatesEOS<vext>;
813
0
  m_updateStates        = DQInternSimd::updateStates<vext>;
814
0
  m_findFirstPos        = DQInternSimd::findFirstPos<vext>;
815
0
}
Unexecuted instantiation: void vvenc::DepQuant::_initDepQuantX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::DepQuant::_initDepQuantX86<(vvenc::x86_simd::X86_VEXT)2>()
Unexecuted instantiation: void vvenc::DepQuant::_initDepQuantX86<(vvenc::x86_simd::X86_VEXT)4>()
816
template void DepQuant::_initDepQuantX86<SIMDX86>();
817
818
}; // namespace vvenc
819
820
//! \}
821
822
#endif //ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 )
823