Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/QuantX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     QuantX86.h
44
    \brief    SIMD for Quant/Dequant
45
*/
46
47
#include "CommonLib/CommonDef.h"
48
#include "CommonDefX86.h"
49
#include "CommonLib/Quant.h"
50
51
namespace vvdec
52
{
53
54
#if ENABLE_SIMD_OPT_QUANT
55
#ifdef TARGET_SIMD_X86
56
57
// Fallback used only in AVX2 builds when _mm256_set_m128i is not already defined as a macro.
// It builds a 256-bit vector whose low 128 bits are v1 and whose high 128 bits are v0
// (v1 is widened with a cast, then v0 is inserted at 128-bit position 1).
// VVCLIB_OWN_mm256_set_m128i records that the definition came from this file
// (presumably so it can be undone later; the matching use is not visible here).
#if USE_AVX2 && !defined( _mm256_set_m128i )
58
#define VVCLIB_OWN_mm256_set_m128i
59
0
#define _mm256_set_m128i( v0, v1 ) _mm256_inserti128_si256( _mm256_castsi128_si256( v1 ), ( v0 ), 1 )
60
61
#endif
62
63
// Sliding-window lane masks for 16-bit coefficient levels.
// levmask holds 8 entries of 0xffff followed by 8 zeros. An unaligned 8-entry load starting at
// index (7 - maxX) therefore yields 0xffff in the first maxX+1 lanes and 0 in the rest.
static constexpr unsigned short levmask[16] = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0,0,0,0,0,0,0,0};
64
#if USE_AVX2
65
// 256-bit counterpart: 16 entries of 0xffff followed by 16 zeros. A 16-entry load starting at
// index (15 - maxX) keeps the first maxX+1 lanes.
static constexpr unsigned short xlevmask[32] = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
66
#endif
67
68
/** \brief    SIMD dequantization with a per-coefficient scaling table.

    For every row y in [0, maxY] the 16-bit levels read from piQCoef are clamped to
    [-(inputMaximum+1), inputMaximum], multiplied by scaleQP (16x16 -> 32 bit) and by the
    matching 32-bit entry of piDequantCoef. The product is then either rounded and shifted
    right arithmetically by rightShift (rightShift > 0) or shifted left by -rightShift
    (rightShift <= 0), clamped to [-(transformMaximum+1), transformMaximum] and stored to piCoef.

    All paths use full-width vector loads and stores, so lanes beyond maxX may be read and
    written. NOTE(review): presumably the caller's buffers are padded accordingly - confirm.

    \param maxX              highest column index to process; also selects the code path
                             (< 4, < 8 in AVX2 builds, otherwise a chunked loop over x)
    \param restX             added to maxX+1 to form `width`, the row stride used for both
                             piDequantCoef and piCoef
    \param maxY              highest row index to process
    \param scaleQP           scale factor; narrowed to 16 bit (cast to short) before use
    \param piDequantCoef     32-bit scaling entries, indexed as [y * width + x]
    \param piQCoef           16-bit input levels, indexed as [y * piQCfStride + x]
    \param piQCfStride       row stride of piQCoef in elements
    \param piCoef            output coefficients, indexed as [y * width + x]
    \param rightShift        > 0: rounding right shift; otherwise left shift by -rightShift
    \param inputMaximum      upper clamp for the input levels; narrowed to 16 bit
    \param transformMaximum  upper clamp for the results; it is cast to short before being
                             broadcast to 32-bit lanes.
                             NOTE(review): a value above 32767 would be truncated by that
                             cast - confirm callers never pass one.
*/
template< X86_VEXT vext>
69
static void DeQuantScalingCoreSIMD(const int maxX,const int restX,const int maxY,const int scaleQP,const int *piDequantCoef,const TCoeffSig*const piQCoef,const size_t piQCfStride,TCoeff   *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
70
0
{
71
0
  const int inputMinimum = -(inputMaximum+1);
72
0
  const TCoeff transformMinimum = -(transformMaximum+1);
73
0
  const int width = restX+maxX+1;
74
0
  // 128-bit level mask: 0xffff in the first maxX+1 16-bit lanes, all ones once maxX >= 7.
  __m128i vlevmask;
75
0
  if (maxX<7)
76
0
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
77
0
  else
78
0
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
79
#if USE_AVX2
80
  // 256-bit level mask: 0xffff in the first maxX+1 16-bit lanes, all ones once maxX >= 15.
  __m256i xvlevmask;
81
0
  if (maxX<15)
82
0
    xvlevmask = _mm256_loadu_si256( ( __m256i const * )&xlevmask[15-maxX] );
83
0
  else
84
0
    xvlevmask = _mm256_set_epi64x(0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff);
85
#endif
86
0
  if (rightShift>0)
87
0
  {
88
0
    // Rounding offset: half of 2^rightShift, added before the arithmetic right shift.
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);
89
90
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
91
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
92
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
93
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
94
0
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
95
0
    __m128i v_add = _mm_set1_epi32 (iAdd);
96
0
    __m128i v_rshift = _mm_set1_epi64x (rightShift);
97
0
    if (maxX<4)
98
0
    {
99
0
      // At most four columns: one 128-bit vector of 32-bit results per row.
      for( int y = 0; y <= maxY; y++)
100
0
      {
101
0
        // maxX > 1 loads four levels at once; narrower rows are assembled element by element
        // and zero-padded. The scale vector below is built the same way.
        __m128i v_level = maxX  > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
102
0
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
103
0
                                    : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );
104
105
0
        __m128i v_scale = maxX  > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
106
0
                                : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
107
0
                                            : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );
108
109
0
        v_level = _mm_max_epi16 (v_level, v_min);
110
0
        v_level = _mm_min_epi16 (v_level, v_max);
111
0
        // level * scaleQP as a 32-bit product: low and high 16-bit halves are computed
        // separately and interleaved by the unpack below.
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
112
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
113
114
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);  
115
0
        v_level = _mm_mullo_epi32(v_level,v_scale);
116
0
        v_level =  _mm_add_epi32(v_level,v_add);
117
0
        v_level = _mm_sra_epi32(v_level,v_rshift);        
118
119
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
120
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
121
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
122
0
      }
123
0
    }
124
#if USE_AVX2
125
0
    else if (maxX<8)
126
0
    {
127
0
      // Five to eight columns: eight levels are loaded as 128 bit and widened into one
      // 256-bit vector of 32-bit products per row.
      // NOTE(review): unlike the matching left-shift path further down, the loaded levels
      // are not ANDed with vlevmask here - confirm that the difference is intentional.
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
128
0
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
129
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
130
131
0
      for( int y = 0; y <= maxY; y++)
132
0
      {
133
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
134
0
        __m256i v_scale = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width] );
135
136
0
        v_level = _mm_max_epi16 (v_level, v_min);
137
0
        v_level = _mm_min_epi16 (v_level, v_max);
138
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
139
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
140
0
        // Elements 0..3 (unpacklo) go to the low half, elements 4..7 (unpackhi) to the high half.
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
141
0
        xv_level = _mm256_mullo_epi32(xv_level,v_scale);
142
0
        xv_level =  _mm256_add_epi32(xv_level,xv_add);
143
0
        xv_level = _mm256_sra_epi32(xv_level,v_rshift);
144
0
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
145
0
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
146
0
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
147
0
      }
148
0
    }
149
0
    else
150
0
    {
151
0
      // More than eight columns: process 16 levels per iteration, written back as two
      // 256-bit stores of eight 32-bit results each.
      __m256i xv_max =  _mm256_set1_epi16 ((short)inputMaximum);
152
0
      __m256i xv_min =  _mm256_set1_epi16 ((short)inputMinimum);
153
0
      __m256i xv_scaleQP = _mm256_set1_epi16 ((short)scaleQP);
154
0
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
155
0
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
156
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
157
158
0
      __m256i v_scale_l,v_scale_h;
159
0
      for( int y = 0; y <= maxY; y++)
160
0
      {
161
0
        for( int x = 0; x <= maxX; x+=16)
162
0
        {
163
0
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride]  );
164
0
          v_scale_l = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x ]);
165
0
          v_scale_h = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x + 8 ]);
166
167
0
          xv_level = _mm256_and_si256(xv_level,xvlevmask);
168
0
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
169
0
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
170
0
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scaleQP);
171
0
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scaleQP);
172
173
0
          // 0xD8 reorders the 64-bit quarters to 0,2,1,3 so that the per-128-bit-lane
          // unpacklo/unpackhi below produce elements 0..7 and 8..15 in natural order.
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
174
0
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);
175
176
0
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
177
0
          xv_level = _mm256_mullo_epi32(xv_level,v_scale_l);
178
179
0
          xv_level =  _mm256_add_epi32(xv_level,xv_add);
180
0
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);
181
182
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
183
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
184
185
0
          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );
186
187
0
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
188
0
          xv_level = _mm256_mullo_epi32(xv_level,v_scale_h);
189
190
0
          xv_level =  _mm256_add_epi32(xv_level,xv_add);
191
0
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);
192
193
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
194
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
195
196
0
          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
197
0
        }
198
0
      }
199
0
    }
200
#else
201
0
    else
202
0
    {        
203
0
      // Non-AVX2 build, four or more columns: process eight levels per iteration as two
      // 128-bit halves of four 32-bit results each.
      __m128i v_scale_l,v_scale_h;
204
0
      for( int y = 0; y <= maxY; y++)
205
0
      {
206
0
        for( int x = 0; x <= maxX; x+=8)
207
0
        {
208
0
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
209
210
          v_scale_l = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );
211
          v_level = _mm_and_si128(v_level,vlevmask);
212
          v_level = _mm_max_epi16 (v_level, v_min);
213
          v_level = _mm_min_epi16 (v_level, v_max);
214
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
215
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
216
217
          v_level = _mm_unpacklo_epi16(v_low,v_high);
218
          v_level = _mm_mullo_epi32(v_level,v_scale_l);
219
          v_level =  _mm_add_epi32(v_level,v_add);
220
          v_level = _mm_sra_epi32(v_level,v_rshift);        
221
222
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
223
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
224
0
          _mm_storeu_si128(( __m128i * )(piCoef+y*width +x), v_level );
225
0
          // The upper four lanes are only handled when more than four columns remain from x.
          if (maxX + 1 - x > 4)
226
0
          {
227
0
            v_scale_h = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4] );
228
0
            v_level = _mm_unpackhi_epi16(v_low,v_high);
229
0
            v_level = _mm_mullo_epi32(v_level,v_scale_h);
230
0
            v_level =  _mm_add_epi32(v_level,v_add);
231
0
            v_level = _mm_sra_epi32(v_level,v_rshift);
232
0
            v_level = _mm_max_epi32 (v_level, v_Tmin);
233
0
            v_level = _mm_min_epi32 (v_level, v_Tmax);
234
0
            _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
235
0
          }
236
0
        }
237
0
      }
238
0
    }
239
#endif
240
0
  }
241
0
  else  // rightShift <= 0: left shift by -rightShift, no rounding offset needed
242
0
  {
243
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
244
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
245
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
246
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
247
0
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
248
0
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);
249
0
    if (maxX<4)
250
0
    {
251
0
      // At most four columns: same layout as the right-shift variant above.
      for( int y = 0; y <= maxY; y++)
252
0
      {
253
0
        __m128i v_level = maxX  > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
254
0
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
255
0
                                    : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );
256
257
0
        __m128i v_scale = maxX  > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
258
0
                                : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
259
0
                                            : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );
260
0
        v_level = _mm_max_epi16 (v_level, v_min);
261
0
        v_level = _mm_min_epi16 (v_level, v_max);
262
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
263
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
264
265
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
266
0
        v_level = _mm_mullo_epi32(v_level,v_scale);
267
0
        v_level = _mm_sll_epi32(v_level,v_lshift);
268
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
269
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
270
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
271
0
      }
272
0
    }
273
#if USE_AVX2
274
0
    else if (maxX<8)
275
0
    {
276
0
      // Five to eight columns: levels beyond maxX are zeroed with vlevmask before scaling.
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
277
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
278
0
      for( int y = 0; y <= maxY; y++)
279
0
      {
280
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
281
0
        __m256i v_scale = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width] );
282
283
        v_level = _mm_and_si128(v_level,vlevmask);
284
        v_level = _mm_max_epi16 (v_level, v_min);
285
        v_level = _mm_min_epi16 (v_level, v_max);
286
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
287
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
288
0
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
289
0
        xv_level = _mm256_mullo_epi32(xv_level,v_scale);
290
0
        xv_level = _mm256_sll_epi32(xv_level,v_lshift);
291
0
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
292
0
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
293
0
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
294
0
      }
295
0
    }
296
0
    else
297
0
    {
298
0
      // More than eight columns: 16 levels per iteration, stored as two 256-bit halves.
      __m256i xv_max =  _mm256_set1_epi16 ((short)inputMaximum);
299
0
      __m256i xv_min =  _mm256_set1_epi16 ((short)inputMinimum);
300
0
      __m256i xv_scaleQP = _mm256_set1_epi16 ((short)scaleQP);
301
0
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
302
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
303
0
      __m256i v_scale_l,v_scale_h;
304
305
0
      for( int y = 0; y <= maxY; y++)
306
0
      {
307
0
        for( int x = 0; x <= maxX; x+=16)
308
0
        {
309
0
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride]  );
310
0
          v_scale_l = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x ]);
311
0
          v_scale_h = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x + 8 ]);
312
313
0
          xv_level = _mm256_and_si256(xv_level,xvlevmask);
314
0
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
315
0
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
316
0
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scaleQP);
317
0
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scaleQP);
318
319
0
          // Quarter reorder 0,2,1,3 so the in-lane unpacks yield elements 0..7 and 8..15.
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
320
0
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);
321
322
0
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
323
0
          xv_level = _mm256_mullo_epi32(xv_level,v_scale_l);
324
325
0
          xv_level = _mm256_sll_epi32(xv_level,v_lshift);
326
327
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
328
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
329
330
0
          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );
331
0
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
332
0
          xv_level = _mm256_mullo_epi32(xv_level,v_scale_h);
333
334
0
          xv_level = _mm256_sll_epi32(xv_level,v_lshift);
335
336
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
337
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
338
339
0
          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
340
0
        }
341
0
      }
342
0
    }
343
#else
344
0
    else
345
0
    {
346
0
      // Non-AVX2 build, four or more columns: eight levels per iteration in two 128-bit halves.
      __m128i v_scale_l,v_scale_h;
347
0
      for( int y = 0; y <= maxY; y++)
348
0
      {
349
0
        for( int x = 0; x <= maxX; x+=8)
350
0
        {
351
0
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
352
353
          v_scale_l = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );
354
          v_level = _mm_and_si128(v_level,vlevmask);
355
          v_level = _mm_max_epi16 (v_level, v_min);
356
          v_level = _mm_min_epi16 (v_level, v_max);
357
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
358
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
359
360
          v_level = _mm_unpacklo_epi16(v_low,v_high);
361
          v_level = _mm_mullo_epi32(v_level,v_scale_l);
362
          v_level = _mm_sll_epi32(v_level,v_lshift);
363
          v_level = _mm_max_epi32 (v_level, v_Tmin);
364
          v_level = _mm_min_epi32 (v_level, v_Tmax);
365
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
366
367
0
          // The upper four lanes are only handled when more than four columns remain from x.
          if (maxX + 1 - x > 4)
368
0
          {
369
0
            v_scale_h = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4] );
370
0
            v_level = _mm_unpackhi_epi16(v_low,v_high);
371
0
            v_level = _mm_mullo_epi32(v_level,v_scale_h);
372
0
            v_level = _mm_sll_epi32(v_level,v_lshift);
373
0
            v_level = _mm_max_epi32 (v_level, v_Tmin);
374
0
            v_level = _mm_min_epi32 (v_level, v_Tmax);
375
0
            _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
376
0
          }
377
0
        }
378
0
      }
379
0
    }
380
#endif
381
0
  }
382
0
}
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
383
384
template< X86_VEXT vext>
385
static void DeQuantCoreSIMD(const int maxX,const int restX,const int maxY,const int scale,const TCoeffSig*const piQCoef,const size_t piQCfStride,TCoeff   *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
386
0
{
387
0
  const int inputMinimum = -(inputMaximum+1);
388
0
  const TCoeff transformMinimum = -(transformMaximum+1);
389
0
  const int width = restX+maxX+1;
390
0
  __m128i vlevmask;
391
0
  if (maxX<7)
392
0
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
393
0
  else
394
0
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
395
396
#if USE_AVX2
397
  __m256i xvlevmask;
398
0
  if (maxX<15)
399
0
    xvlevmask = _mm256_loadu_si256( ( __m256i const * )&xlevmask[15-maxX] );
400
0
  else
401
0
    xvlevmask = _mm256_set_epi64x(0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff);
402
403
#endif
404
405
0
  if (rightShift>0)
406
0
  {
407
0
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);
408
409
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
410
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
411
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
412
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
413
0
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
414
0
    __m128i v_add = _mm_set1_epi32 (iAdd);
415
0
    __m128i v_rshift = _mm_set1_epi64x (rightShift);
416
417
0
    if (maxX<4)
418
0
    {
419
0
      for( int y = 0; y <= maxY; y++)
420
0
      {
421
0
        __m128i v_level = maxX  > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
422
0
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
423
0
                                    : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );
424
425
0
        v_level = _mm_max_epi16 (v_level, v_min);
426
0
        v_level = _mm_min_epi16 (v_level, v_max);
427
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
428
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
429
430
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
431
0
        v_level =  _mm_add_epi32(v_level,v_add);
432
0
        v_level = _mm_sra_epi32(v_level,v_rshift);
433
434
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
435
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
436
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
437
0
      }
438
0
    }
439
#if USE_AVX2
440
0
    else if (maxX<8)
441
0
    {
442
0
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
443
0
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
444
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
445
446
0
      for( int y = 0; y <= maxY; y++)
447
0
      {
448
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
449
0
        v_level = _mm_max_epi16 (v_level, v_min);
450
0
        v_level = _mm_min_epi16 (v_level, v_max);
451
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
452
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
453
454
0
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
455
456
0
        xv_level =  _mm256_add_epi32(xv_level,xv_add);
457
0
        xv_level = _mm256_sra_epi32(xv_level,v_rshift);
458
459
0
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
460
0
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
461
462
0
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
463
0
      }
464
0
    }
465
0
    else
466
0
    {
467
0
      __m256i xv_max =  _mm256_set1_epi16 ((short)inputMaximum);
468
0
      __m256i xv_min =  _mm256_set1_epi16 ((short)inputMinimum);
469
0
      __m256i xv_scale = _mm256_set1_epi16 ((short)scale);
470
0
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
471
0
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
472
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
473
474
0
      for( int y = 0; y <= maxY; y++)
475
0
      {
476
0
        for( int x = 0; x <= maxX; x+=16)
477
0
        {
478
0
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride]  );
479
0
          xv_level = _mm256_and_si256(xv_level,xvlevmask);
480
0
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
481
0
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
482
0
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scale);
483
0
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scale);
484
485
0
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
486
0
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);
487
488
489
0
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
490
0
          xv_level =  _mm256_add_epi32(xv_level,xv_add);
491
0
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);
492
493
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
494
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
495
496
0
          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );
497
0
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
498
0
          xv_level =  _mm256_add_epi32(xv_level,xv_add);
499
0
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);
500
501
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
502
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
503
504
0
          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
505
0
        }
506
0
      }
507
508
0
    }
509
510
#else
511
/*
512
    else if (maxX<8)
513
    {
514
      for( int y = 0; y <= maxY; y++)
515
      {
516
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
517
        v_level = _mm_and_si128(v_level,vlevmask);
518
        v_level = _mm_max_epi16 (v_level, v_min);
519
        v_level = _mm_min_epi16 (v_level, v_max);
520
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
521
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
522
523
        v_level = _mm_unpacklo_epi16(v_low,v_high);
524
        v_level =  _mm_add_epi32(v_level,v_add);
525
        v_level = _mm_sra_epi32(v_level,v_rshift);
526
527
        v_level = _mm_max_epi32 (v_level, v_Tmin);
528
        v_level = _mm_min_epi32 (v_level, v_Tmax);
529
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
530
531
        v_level = _mm_unpackhi_epi16(v_low,v_high);
532
        v_level =  _mm_add_epi32(v_level,v_add);
533
        v_level = _mm_sra_epi32(v_level,v_rshift);
534
535
        v_level = _mm_max_epi32 (v_level, v_Tmin);
536
        v_level = _mm_min_epi32 (v_level, v_Tmax);
537
        _mm_storeu_si128(( __m128i * )(piCoef+4+y*width ), v_level );
538
      }
539
    }
540
*/
541
0
    else
542
0
    {
543
0
      for( int y = 0; y <= maxY; y++)
544
0
      {
545
0
        for( int x = 0; x <= maxX; x+=8)
546
0
        {
547
0
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
548
0
          v_level = _mm_and_si128(v_level,vlevmask);
549
0
          v_level = _mm_max_epi16 (v_level, v_min);
550
0
          v_level = _mm_min_epi16 (v_level, v_max);
551
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
552
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
553
554
          v_level = _mm_unpacklo_epi16(v_low,v_high);
555
          v_level =  _mm_add_epi32(v_level,v_add);
556
          v_level = _mm_sra_epi32(v_level,v_rshift);
557
558
          v_level = _mm_max_epi32 (v_level, v_Tmin);
559
          v_level = _mm_min_epi32 (v_level, v_Tmax);
560
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
561
562
0
          if( maxX + 1 - x <= 4 ) continue;
563
564
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
565
0
          v_level =  _mm_add_epi32(v_level,v_add);
566
0
          v_level = _mm_sra_epi32(v_level,v_rshift);
567
568
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
569
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
570
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
571
0
        }
572
0
      }
573
0
    }
574
#endif
575
0
  }
576
0
  else  // rightshift <0
577
0
  {
578
579
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
580
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
581
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
582
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
583
0
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
584
0
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);
585
586
0
    if (maxX<4)
587
0
    {
588
0
      for( int y = 0; y <= maxY; y++)
589
0
      {
590
0
        __m128i v_level = maxX  > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
591
0
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
592
0
                                    : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );
593
594
0
        v_level = _mm_max_epi16 (v_level, v_min);
595
0
        v_level = _mm_min_epi16 (v_level, v_max);
596
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
597
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
598
599
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
600
0
        v_level = _mm_sll_epi32(v_level,v_lshift);
601
602
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
603
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
604
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
605
606
0
      }
607
0
    }
608
#if USE_AVX2
609
0
    else if (maxX<8)
610
0
    {
611
0
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
612
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
613
614
0
      for( int y = 0; y <= maxY; y++)
615
0
      {
616
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
617
0
        v_level = _mm_and_si128(v_level,vlevmask);
618
0
        v_level = _mm_max_epi16 (v_level, v_min);
619
0
        v_level = _mm_min_epi16 (v_level, v_max);
620
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
621
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
622
623
0
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
624
625
0
        xv_level = _mm256_sll_epi32(xv_level,v_lshift);
626
627
0
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
628
0
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
629
630
0
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
631
0
      }
632
0
    }
633
634
0
    else
635
0
    {
636
0
      __m256i xv_max =  _mm256_set1_epi16 ((short)inputMaximum);
637
0
      __m256i xv_min =  _mm256_set1_epi16 ((short)inputMinimum);
638
0
      __m256i xv_scale = _mm256_set1_epi16 ((short)scale);
639
0
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
640
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
641
642
0
      for( int y = 0; y <= maxY; y++)
643
0
      {
644
0
        for( int x = 0; x <= maxX; x+=16)
645
0
        {
646
0
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride]  );
647
0
          xv_level = _mm256_and_si256(xv_level,xvlevmask);
648
0
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
649
0
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
650
0
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scale);
651
0
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scale);
652
653
0
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
654
0
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);
655
656
657
0
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
658
0
          xv_level = _mm256_sll_epi32(xv_level,v_lshift);
659
660
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
661
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
662
663
0
          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );
664
0
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
665
0
          xv_level = _mm256_sll_epi32(xv_level,v_lshift);
666
667
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
668
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
669
670
0
          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
671
0
        }
672
0
      }
673
0
    }
674
675
#else
676
/*
677
    else if (maxX<8)
678
    {
679
      for( int y = 0; y <= maxY; y++)
680
      {
681
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
682
        v_level = _mm_and_si128(v_level,vlevmask);
683
        v_level = _mm_max_epi16 (v_level, v_min);
684
        v_level = _mm_min_epi16 (v_level, v_max);
685
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
686
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
687
688
        v_level = _mm_unpacklo_epi16(v_low,v_high);
689
        v_level = _mm_sll_epi32(v_level,v_lshift);
690
691
        v_level = _mm_max_epi32 (v_level, v_Tmin);
692
        v_level = _mm_min_epi32 (v_level, v_Tmax);
693
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
694
695
        v_level = _mm_unpackhi_epi16(v_low,v_high);
696
        v_level = _mm_sll_epi32(v_level,v_lshift);
697
698
        v_level = _mm_max_epi32 (v_level, v_Tmin);
699
        v_level = _mm_min_epi32 (v_level, v_Tmax);
700
        _mm_storeu_si128(( __m128i * )(piCoef+4+y*width ), v_level );
701
      }
702
    }
703
*/
704
0
    else
705
0
    {
706
0
      for( int y = 0; y <= maxY; y++)
707
0
      {
708
0
        for( int x = 0; x <= maxX; x+=8)
709
0
        {
710
0
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
711
0
          v_level = _mm_and_si128(v_level,vlevmask);
712
0
          v_level = _mm_max_epi16 (v_level, v_min);
713
0
          v_level = _mm_min_epi16 (v_level, v_max);
714
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
715
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
716
717
          v_level = _mm_unpacklo_epi16(v_low,v_high);
718
          v_level = _mm_sll_epi32(v_level,v_lshift);
719
720
          v_level = _mm_max_epi32 (v_level, v_Tmin);
721
          v_level = _mm_min_epi32 (v_level, v_Tmax);
722
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
723
724
0
          if( maxX + 1 - x <= 4 ) continue;
725
726
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
727
0
          v_level = _mm_sll_epi32(v_level,v_lshift);
728
729
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
730
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
731
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
732
0
        }
733
0
      }
734
0
    }
735
#endif
736
0
  }
737
0
}
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, int, int, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, int, int, short const*, unsigned long, int*, int, int, int)
738
739
template< X86_VEXT vext>
740
static void DeQuantCorePCMSIMD(const int maxX,const int restX,const int maxY,const int scale,TCoeff   *const piQCoef,const size_t piQCfStride,TCoeff   *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
741
0
{
742
0
  const int inputMinimum = -(inputMaximum+1);
743
0
  const TCoeff transformMinimum = -(transformMaximum+1);
744
0
  const int width = restX+maxX+1;
745
0
  __m128i vlevmask;
746
0
  if (maxX<7)
747
0
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
748
0
  else
749
0
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
750
751
0
  if (rightShift>0)
752
0
  {
753
0
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);
754
755
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
756
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
757
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
758
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
759
0
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
760
0
    __m128i v_add = _mm_set1_epi32 (iAdd);
761
0
    __m128i v_rshift = _mm_set1_epi64x (rightShift);
762
763
0
    if (maxX<4)
764
0
    {
765
0
      for( int y = 0; y <= maxY; y++)
766
0
      {
767
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
768
0
        v_level = _mm_packs_epi32 (v_level,v_level);
769
0
        v_level = _mm_and_si128(v_level,vlevmask);
770
0
        v_level = _mm_max_epi16 (v_level, v_min);
771
0
        v_level = _mm_min_epi16 (v_level, v_max);
772
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
773
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
774
775
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
776
0
        v_level =  _mm_add_epi32(v_level,v_add);
777
0
        v_level = _mm_sra_epi32(v_level,v_rshift);
778
779
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
780
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
781
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
782
0
      }
783
0
    }
784
0
    else
785
0
    {
786
0
      for( int y = 0; y <= maxY; y++)
787
0
      {
788
0
        for( int x = 0; x <= maxX; x+=8)
789
0
        {
790
791
0
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
792
0
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride]  );
793
0
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
794
0
          v_level = _mm_and_si128(v_level,vlevmask);
795
0
          v_level = _mm_max_epi16 (v_level, v_min);
796
0
          v_level = _mm_min_epi16 (v_level, v_max);
797
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
798
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
799
800
0
          v_level = _mm_unpacklo_epi16(v_low,v_high);
801
0
          v_level =  _mm_add_epi32(v_level,v_add);
802
0
          v_level = _mm_sra_epi32(v_level,v_rshift);
803
804
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
805
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
806
0
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
807
808
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
809
0
          v_level =  _mm_add_epi32(v_level,v_add);
810
0
          v_level = _mm_sra_epi32(v_level,v_rshift);
811
812
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
813
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
814
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
815
0
        }
816
0
      }
817
0
    }
818
0
  }
819
0
  else  // rightshift <0
820
0
  {
821
822
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
823
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
824
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
825
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
826
0
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
827
0
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);
828
829
0
    if (maxX<4)
830
0
    {
831
0
      for( int y = 0; y <= maxY; y++)
832
0
      {
833
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
834
0
        v_level = _mm_packs_epi32 (v_level,v_level);
835
0
        v_level = _mm_and_si128(v_level,vlevmask);
836
837
0
        v_level = _mm_max_epi16 (v_level, v_min);
838
0
        v_level = _mm_min_epi16 (v_level, v_max);
839
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
840
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
841
842
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
843
0
        v_level = _mm_sll_epi32(v_level,v_lshift);
844
845
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
846
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
847
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
848
0
      }
849
0
    }
850
0
    else
851
0
    {
852
0
      for( int y = 0; y <= maxY; y++)
853
0
      {
854
0
        for( int x = 0; x <= maxX; x+=8)
855
0
        {
856
0
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
857
0
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride]  );
858
0
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
859
0
          v_level = _mm_and_si128(v_level,vlevmask);
860
0
          v_level = _mm_max_epi16 (v_level, v_min);
861
0
          v_level = _mm_min_epi16 (v_level, v_max);
862
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
863
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
864
865
0
          v_level = _mm_unpacklo_epi16(v_low,v_high);
866
0
          v_level = _mm_sll_epi32(v_level,v_lshift);
867
868
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
869
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
870
0
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
871
872
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
873
0
          v_level = _mm_sll_epi32(v_level,v_lshift);
874
875
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
876
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
877
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
878
0
        }
879
0
      }
880
0
    }
881
0
  }
882
0
}
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantCorePCMSIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, int, int, int*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantCorePCMSIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, int, int, int*, unsigned long, int*, int, int, int)
883
884
template< X86_VEXT vext>
885
static void DeQuantScalingPCMCoreSIMD(const int maxX,const int restX,const int maxY,const int scaleQP,const int     *piDequantCoef,TCoeff   *const piQCoef,const size_t piQCfStride,TCoeff   *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
886
0
{
887
0
  const int inputMinimum = -(inputMaximum+1);
888
0
  const TCoeff transformMinimum = -(transformMaximum+1);
889
0
  const int width = restX+maxX+1;
890
0
  __m128i vlevmask;
891
0
  if (maxX<7)
892
0
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
893
0
  else
894
0
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
895
896
0
  if (rightShift>0)
897
0
  {
898
0
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);
899
900
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
901
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
902
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
903
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
904
0
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
905
0
    __m128i v_add = _mm_set1_epi32 (iAdd);
906
0
    __m128i v_rshift = _mm_set1_epi64x (rightShift);
907
908
0
    if (maxX<4)
909
0
    {
910
0
      for( int y = 0; y <= maxY; y++)
911
0
      {
912
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
913
0
        __m128i v_scale = maxX  > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
914
0
                                : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
915
0
                                            : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );
916
917
0
        v_level = _mm_packs_epi32 (v_level,v_level);
918
0
        v_level = _mm_and_si128(v_level,vlevmask);
919
0
        v_level = _mm_max_epi16 (v_level, v_min);
920
0
        v_level = _mm_min_epi16 (v_level, v_max);
921
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
922
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
923
924
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
925
0
        v_level = _mm_mullo_epi32(v_level,v_scale);
926
927
0
        v_level =  _mm_add_epi32(v_level,v_add);
928
0
        v_level = _mm_sra_epi32(v_level,v_rshift);
929
930
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
931
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
932
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
933
0
      }
934
0
    }
935
0
    else
936
0
    {
937
0
      for( int y = 0; y <= maxY; y++)
938
0
      {
939
0
        for( int x = 0; x <= maxX; x+=8)
940
0
        {
941
0
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
942
0
          __m128i v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );
943
0
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride]  );
944
0
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
945
946
0
          v_level = _mm_and_si128(v_level,vlevmask);
947
0
          v_level = _mm_max_epi16 (v_level, v_min);
948
0
          v_level = _mm_min_epi16 (v_level, v_max);
949
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
950
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
951
952
0
          v_level = _mm_unpacklo_epi16(v_low,v_high);
953
954
0
          v_level = _mm_mullo_epi32(v_level,v_scale);
955
956
0
          v_level =  _mm_add_epi32(v_level,v_add);
957
0
          v_level = _mm_sra_epi32(v_level,v_rshift);
958
959
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
960
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
961
0
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
962
963
0
          v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4]);
964
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
965
0
          v_level = _mm_mullo_epi32(v_level,v_scale);
966
0
          v_level =  _mm_add_epi32(v_level,v_add);
967
0
          v_level = _mm_sra_epi32(v_level,v_rshift);
968
969
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
970
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
971
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
972
0
        }
973
0
      }
974
0
    }
975
0
  }
976
0
  else  // rightshift <0
977
0
  {
978
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
979
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
980
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
981
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
982
0
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
983
0
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);
984
985
0
    if (maxX<4)
986
0
    {
987
0
      for( int y = 0; y <= maxY; y++)
988
0
      {
989
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
990
0
        __m128i v_scale = maxX  > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
991
0
                                : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
992
0
                                            : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );
993
994
0
        v_level = _mm_packs_epi32 (v_level,v_level);
995
0
        v_level = _mm_and_si128(v_level,vlevmask);
996
997
0
        v_level = _mm_max_epi16 (v_level, v_min);
998
0
        v_level = _mm_min_epi16 (v_level, v_max);
999
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
1000
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
1001
1002
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
1003
0
        v_level = _mm_mullo_epi32(v_level,v_scale);
1004
1005
0
        v_level = _mm_sll_epi32(v_level,v_lshift);
1006
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
1007
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
1008
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
1009
0
      }
1010
0
    }
1011
0
    else
1012
0
    {
1013
0
      for( int y = 0; y <= maxY; y++)
1014
0
      {
1015
0
        for( int x = 0; x <= maxX; x+=8)
1016
0
        {
1017
0
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
1018
0
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride]  );
1019
0
          __m128i v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );
1020
1021
0
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
1022
0
          v_level = _mm_and_si128(v_level,vlevmask);
1023
0
          v_level = _mm_max_epi16 (v_level, v_min);
1024
0
          v_level = _mm_min_epi16 (v_level, v_max);
1025
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
1026
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
1027
1028
0
          v_level = _mm_unpacklo_epi16(v_low,v_high);
1029
0
          v_level = _mm_mullo_epi32(v_level,v_scale);
1030
1031
0
          v_level = _mm_sll_epi32(v_level,v_lshift);
1032
1033
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
1034
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
1035
0
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
1036
1037
0
          v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4]);
1038
1039
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
1040
0
          v_level = _mm_mullo_epi32(v_level,v_scale);
1041
1042
0
          v_level = _mm_sll_epi32(v_level,v_lshift);
1043
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
1044
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
1045
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
1046
0
        }
1047
0
      }
1048
0
    }
1049
0
  }
1050
0
}
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantScalingPCMCoreSIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, int, int, int const*, int*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantScalingPCMCoreSIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, int, int, int const*, int*, unsigned long, int*, int, int, int)
1051
1052
1053
1054
1055
template<X86_VEXT vext>
1056
void Quant::_initQuantX86()
1057
0
{
1058
0
  DeQuant = DeQuantCoreSIMD<vext>;
1059
0
  DeQuantPCM = DeQuantCorePCMSIMD<vext>;
1060
0
  DeQuantScaling    = DeQuantScalingCoreSIMD<vext>;
1061
0
  DeQuantScalingPCM    = DeQuantScalingPCMCoreSIMD<vext>;
1062
1063
0
}
Unexecuted instantiation: void vvdec::Quant::_initQuantX86<(vvdec::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvdec::Quant::_initQuantX86<(vvdec::x86_simd::X86_VEXT)4>()
1064
// Explicit instantiation for the SIMD level named by SIMDX86 (defined outside this
// chunk; presumably set per translation unit, e.g. Quant_sse41.cpp / Quant_avx2.cpp).
template void Quant::_initQuantX86<SIMDX86>();
1065
1066
#endif // TARGET_SIMD_X86
1067
#endif
1068
1069
}