Coverage Report

Created: 2026-05-24 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/QuantX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     QuantX86.h
44
    \brief    SIMD for Quant/Dequant
45
*/
46
47
#include "CommonLib/CommonDef.h"
48
#include "CommonDefX86.h"
49
#include "CommonLib/Quant.h"
50
51
namespace vvdec
52
{
53
54
#if ENABLE_SIMD_OPT_QUANT
55
#ifdef TARGET_SIMD_X86
56
57
#if USE_AVX2 && !defined( _mm256_set_m128i )
58
#define VVCLIB_OWN_mm256_set_m128i
59
29.9k
// Fallback definition, only active when the enclosing #if finds no macro of this name:
// combines two 128-bit vectors into one 256-bit vector, `lo` in bits 0..127 and `hi` in bits 128..255.
#define _mm256_set_m128i( hi, lo ) _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), ( hi ), 1 )
60
61
#endif
62
63
// Sliding 16-bit lane mask for 128-bit vectors: eight all-ones entries followed by eight zeros.
// Reading eight consecutive entries starting at index (7 - maxX) yields a mask whose first
// maxX + 1 lanes are set and whose remaining lanes are cleared.
static constexpr unsigned short levmask[16] = {
  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
       0,      0,      0,      0,      0,      0,      0,      0
};
64
#if USE_AVX2
65
// Sliding 16-bit lane mask for 256-bit vectors: sixteen all-ones entries followed by sixteen zeros.
// Reading sixteen consecutive entries starting at index (15 - maxX) yields a mask whose first
// maxX + 1 lanes are set and whose remaining lanes are cleared.
static constexpr unsigned short xlevmask[32] = {
  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
       0,      0,      0,      0,      0,      0,      0,      0,
       0,      0,      0,      0,      0,      0,      0,      0
};
66
#endif
67
68
template< X86_VEXT vext>
69
static void DeQuantScalingCoreSIMD(const int maxX,const int restX,const int maxY,const int scaleQP,const int *piDequantCoef,const TCoeffSig*const piQCoef,const size_t piQCfStride,TCoeff   *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
70
0
{
71
0
  const int inputMinimum = -(inputMaximum+1);
72
0
  const TCoeff transformMinimum = -(transformMaximum+1);
73
0
  const int width = restX+maxX+1;
74
0
  __m128i vlevmask;
75
0
  if (maxX<7)
76
0
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
77
0
  else
78
0
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
79
#if USE_AVX2
80
  __m256i xvlevmask;
81
0
  if (maxX<15)
82
0
    xvlevmask = _mm256_loadu_si256( ( __m256i const * )&xlevmask[15-maxX] );
83
0
  else
84
0
    xvlevmask = _mm256_set_epi64x(0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff);
85
#endif
86
0
  if (rightShift>0)
87
0
  {
88
0
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);
89
90
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
91
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
92
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
93
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
94
0
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
95
0
    __m128i v_add = _mm_set1_epi32 (iAdd);
96
0
    __m128i v_rshift = _mm_set1_epi64x (rightShift);
97
0
    if (maxX<4)
98
0
    {
99
0
      for( int y = 0; y <= maxY; y++)
100
0
      {
101
0
        __m128i v_level = maxX  > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
102
0
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
103
0
                                    : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );
104
105
0
        __m128i v_scale = maxX  > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
106
0
                                : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
107
0
                                            : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );
108
109
0
        v_level = _mm_max_epi16 (v_level, v_min);
110
0
        v_level = _mm_min_epi16 (v_level, v_max);
111
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
112
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
113
114
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);  
115
0
        v_level = _mm_mullo_epi32(v_level,v_scale);
116
0
        v_level =  _mm_add_epi32(v_level,v_add);
117
0
        v_level = _mm_sra_epi32(v_level,v_rshift);        
118
119
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
120
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
121
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
122
0
      }
123
0
    }
124
#if USE_AVX2
125
0
    else if (maxX<8)
126
0
    {
127
0
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
128
0
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
129
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
130
131
0
      for( int y = 0; y <= maxY; y++)
132
0
      {
133
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
134
0
        __m256i v_scale = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width] );
135
136
0
        v_level = _mm_max_epi16 (v_level, v_min);
137
0
        v_level = _mm_min_epi16 (v_level, v_max);
138
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
139
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
140
0
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
141
0
        xv_level = _mm256_mullo_epi32(xv_level,v_scale);
142
0
        xv_level =  _mm256_add_epi32(xv_level,xv_add);
143
0
        xv_level = _mm256_sra_epi32(xv_level,v_rshift);
144
0
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
145
0
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
146
0
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
147
0
      }
148
0
    }
149
0
    else
150
0
    {
151
0
      __m256i xv_max =  _mm256_set1_epi16 ((short)inputMaximum);
152
0
      __m256i xv_min =  _mm256_set1_epi16 ((short)inputMinimum);
153
0
      __m256i xv_scaleQP = _mm256_set1_epi16 ((short)scaleQP);
154
0
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
155
0
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
156
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
157
158
0
      __m256i v_scale_l,v_scale_h;
159
0
      for( int y = 0; y <= maxY; y++)
160
0
      {
161
0
        for( int x = 0; x <= maxX; x+=16)
162
0
        {
163
0
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride]  );
164
0
          v_scale_l = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x ]);
165
0
          v_scale_h = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x + 8 ]);
166
167
0
          xv_level = _mm256_and_si256(xv_level,xvlevmask);
168
0
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
169
0
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
170
0
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scaleQP);
171
0
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scaleQP);
172
173
0
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
174
0
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);
175
176
0
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
177
0
          xv_level = _mm256_mullo_epi32(xv_level,v_scale_l);
178
179
0
          xv_level =  _mm256_add_epi32(xv_level,xv_add);
180
0
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);
181
182
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
183
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
184
185
0
          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );
186
187
0
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
188
0
          xv_level = _mm256_mullo_epi32(xv_level,v_scale_h);
189
190
0
          xv_level =  _mm256_add_epi32(xv_level,xv_add);
191
0
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);
192
193
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
194
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
195
196
0
          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
197
0
        }
198
0
      }
199
0
    }
200
#else
201
0
    else
202
0
    {        
203
0
      __m128i v_scale_l,v_scale_h;
204
0
      for( int y = 0; y <= maxY; y++)
205
0
      {
206
0
        for( int x = 0; x <= maxX; x+=8)
207
0
        {
208
0
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
209
210
          v_scale_l = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );
211
          v_level = _mm_and_si128(v_level,vlevmask);
212
          v_level = _mm_max_epi16 (v_level, v_min);
213
          v_level = _mm_min_epi16 (v_level, v_max);
214
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
215
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
216
217
          v_level = _mm_unpacklo_epi16(v_low,v_high);
218
          v_level = _mm_mullo_epi32(v_level,v_scale_l);
219
          v_level =  _mm_add_epi32(v_level,v_add);
220
          v_level = _mm_sra_epi32(v_level,v_rshift);        
221
222
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
223
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
224
0
          _mm_storeu_si128(( __m128i * )(piCoef+y*width +x), v_level );
225
0
          if (maxX + 1 - x > 4)
226
0
          {
227
0
            v_scale_h = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4] );
228
0
            v_level = _mm_unpackhi_epi16(v_low,v_high);
229
0
            v_level = _mm_mullo_epi32(v_level,v_scale_h);
230
0
            v_level =  _mm_add_epi32(v_level,v_add);
231
0
            v_level = _mm_sra_epi32(v_level,v_rshift);
232
0
            v_level = _mm_max_epi32 (v_level, v_Tmin);
233
0
            v_level = _mm_min_epi32 (v_level, v_Tmax);
234
0
            _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
235
0
          }
236
0
        }
237
0
      }
238
0
    }
239
#endif
240
0
  }
241
0
  else  // rightshift <0
242
0
  {
243
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
244
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
245
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
246
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
247
0
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
248
0
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);
249
0
    if (maxX<4)
250
0
    {
251
0
      for( int y = 0; y <= maxY; y++)
252
0
      {
253
0
        __m128i v_level = maxX  > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
254
0
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
255
0
                                    : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );
256
257
0
        __m128i v_scale = maxX  > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
258
0
                                : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
259
0
                                            : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );
260
0
        v_level = _mm_max_epi16 (v_level, v_min);
261
0
        v_level = _mm_min_epi16 (v_level, v_max);
262
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
263
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
264
265
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
266
0
        v_level = _mm_mullo_epi32(v_level,v_scale);
267
0
        v_level = _mm_sll_epi32(v_level,v_lshift);
268
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
269
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
270
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
271
0
      }
272
0
    }
273
#if USE_AVX2
274
0
    else if (maxX<8)
275
0
    {
276
0
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
277
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
278
0
      for( int y = 0; y <= maxY; y++)
279
0
      {
280
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
281
0
        __m256i v_scale = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width] );
282
283
        v_level = _mm_and_si128(v_level,vlevmask);
284
        v_level = _mm_max_epi16 (v_level, v_min);
285
        v_level = _mm_min_epi16 (v_level, v_max);
286
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
287
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
288
0
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
289
0
        xv_level = _mm256_mullo_epi32(xv_level,v_scale);
290
0
        xv_level = _mm256_sll_epi32(xv_level,v_lshift);
291
0
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
292
0
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
293
0
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
294
0
      }
295
0
    }
296
0
    else
297
0
    {
298
0
      __m256i xv_max =  _mm256_set1_epi16 ((short)inputMaximum);
299
0
      __m256i xv_min =  _mm256_set1_epi16 ((short)inputMinimum);
300
0
      __m256i xv_scaleQP = _mm256_set1_epi16 ((short)scaleQP);
301
0
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
302
0
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
303
0
      __m256i v_scale_l,v_scale_h;
304
305
0
      for( int y = 0; y <= maxY; y++)
306
0
      {
307
0
        for( int x = 0; x <= maxX; x+=16)
308
0
        {
309
0
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride]  );
310
0
          v_scale_l = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x ]);
311
0
          v_scale_h = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x + 8 ]);
312
313
0
          xv_level = _mm256_and_si256(xv_level,xvlevmask);
314
0
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
315
0
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
316
0
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scaleQP);
317
0
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scaleQP);
318
319
0
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
320
0
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);
321
322
0
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
323
0
          xv_level = _mm256_mullo_epi32(xv_level,v_scale_l);
324
325
0
          xv_level = _mm256_sll_epi32(xv_level,v_lshift);
326
327
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
328
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
329
330
0
          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );
331
0
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
332
0
          xv_level = _mm256_mullo_epi32(xv_level,v_scale_h);
333
334
0
          xv_level = _mm256_sll_epi32(xv_level,v_lshift);
335
336
0
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
337
0
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
338
339
0
          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
340
0
        }
341
0
      }
342
0
    }
343
#else
344
0
    else
345
0
    {
346
0
      __m128i v_scale_l,v_scale_h;
347
0
      for( int y = 0; y <= maxY; y++)
348
0
      {
349
0
        for( int x = 0; x <= maxX; x+=8)
350
0
        {
351
0
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
352
353
          v_scale_l = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );
354
          v_level = _mm_and_si128(v_level,vlevmask);
355
          v_level = _mm_max_epi16 (v_level, v_min);
356
          v_level = _mm_min_epi16 (v_level, v_max);
357
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
358
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
359
360
          v_level = _mm_unpacklo_epi16(v_low,v_high);
361
          v_level = _mm_mullo_epi32(v_level,v_scale_l);
362
          v_level = _mm_sll_epi32(v_level,v_lshift);
363
          v_level = _mm_max_epi32 (v_level, v_Tmin);
364
          v_level = _mm_min_epi32 (v_level, v_Tmax);
365
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
366
367
0
          if (maxX + 1 - x > 4)
368
0
          {
369
0
            v_scale_h = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4] );
370
0
            v_level = _mm_unpackhi_epi16(v_low,v_high);
371
0
            v_level = _mm_mullo_epi32(v_level,v_scale_h);
372
0
            v_level = _mm_sll_epi32(v_level,v_lshift);
373
0
            v_level = _mm_max_epi32 (v_level, v_Tmin);
374
0
            v_level = _mm_min_epi32 (v_level, v_Tmax);
375
0
            _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
376
0
          }
377
0
        }
378
0
      }
379
0
    }
380
#endif
381
0
  }
382
0
}
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantScalingCoreSIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, int, int, int const*, short const*, unsigned long, int*, int, int, int)
383
384
template< X86_VEXT vext>
385
static void DeQuantCoreSIMD(const int maxX,const int restX,const int maxY,const int scale,const TCoeffSig*const piQCoef,const size_t piQCfStride,TCoeff   *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
386
26.4k
{
387
26.4k
  const int inputMinimum = -(inputMaximum+1);
388
26.4k
  const TCoeff transformMinimum = -(transformMaximum+1);
389
26.4k
  const int width = restX+maxX+1;
390
26.4k
  __m128i vlevmask;
391
26.4k
  if (maxX<7)
392
21.0k
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
393
5.35k
  else
394
5.35k
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
395
396
#if USE_AVX2
397
  __m256i xvlevmask;
398
26.4k
  if (maxX<15)
399
25.9k
    xvlevmask = _mm256_loadu_si256( ( __m256i const * )&xlevmask[15-maxX] );
400
528
  else
401
528
    xvlevmask = _mm256_set_epi64x(0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff);
402
403
#endif
404
405
26.4k
  if (rightShift>0)
406
3.93k
  {
407
3.93k
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);
408
409
3.93k
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
410
3.93k
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
411
3.93k
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
412
3.93k
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
413
3.93k
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
414
3.93k
    __m128i v_add = _mm_set1_epi32 (iAdd);
415
3.93k
    __m128i v_rshift = _mm_set1_epi64x (rightShift);
416
417
3.93k
    if (maxX<4)
418
2.96k
    {
419
9.63k
      for( int y = 0; y <= maxY; y++)
420
6.66k
      {
421
6.66k
        __m128i v_level = maxX  > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
422
6.66k
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
423
2.18k
                                    : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );
424
425
6.66k
        v_level = _mm_max_epi16 (v_level, v_min);
426
6.66k
        v_level = _mm_min_epi16 (v_level, v_max);
427
6.66k
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
428
6.66k
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
429
430
6.66k
        v_level = _mm_unpacklo_epi16(v_low,v_high);
431
6.66k
        v_level =  _mm_add_epi32(v_level,v_add);
432
6.66k
        v_level = _mm_sra_epi32(v_level,v_rshift);
433
434
6.66k
        v_level = _mm_max_epi32 (v_level, v_Tmin);
435
6.66k
        v_level = _mm_min_epi32 (v_level, v_Tmax);
436
6.66k
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
437
6.66k
      }
438
2.96k
    }
439
#if USE_AVX2
440
970
    else if (maxX<8)
441
648
    {
442
648
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
443
648
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
444
648
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
445
446
4.38k
      for( int y = 0; y <= maxY; y++)
447
3.74k
      {
448
3.74k
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
449
3.74k
        v_level = _mm_max_epi16 (v_level, v_min);
450
3.74k
        v_level = _mm_min_epi16 (v_level, v_max);
451
3.74k
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
452
3.74k
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
453
454
3.74k
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
455
456
3.74k
        xv_level =  _mm256_add_epi32(xv_level,xv_add);
457
3.74k
        xv_level = _mm256_sra_epi32(xv_level,v_rshift);
458
459
3.74k
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
460
3.74k
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
461
462
3.74k
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
463
3.74k
      }
464
648
    }
465
322
    else
466
322
    {
467
322
      __m256i xv_max =  _mm256_set1_epi16 ((short)inputMaximum);
468
322
      __m256i xv_min =  _mm256_set1_epi16 ((short)inputMinimum);
469
322
      __m256i xv_scale = _mm256_set1_epi16 ((short)scale);
470
322
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
471
322
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
472
322
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
473
474
3.81k
      for( int y = 0; y <= maxY; y++)
475
3.49k
      {
476
7.72k
        for( int x = 0; x <= maxX; x+=16)
477
4.22k
        {
478
4.22k
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride]  );
479
4.22k
          xv_level = _mm256_and_si256(xv_level,xvlevmask);
480
4.22k
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
481
4.22k
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
482
4.22k
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scale);
483
4.22k
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scale);
484
485
4.22k
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
486
4.22k
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);
487
488
489
4.22k
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
490
4.22k
          xv_level =  _mm256_add_epi32(xv_level,xv_add);
491
4.22k
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);
492
493
4.22k
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
494
4.22k
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
495
496
4.22k
          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );
497
4.22k
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
498
4.22k
          xv_level =  _mm256_add_epi32(xv_level,xv_add);
499
4.22k
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);
500
501
4.22k
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
502
4.22k
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
503
504
4.22k
          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
505
4.22k
        }
506
3.49k
      }
507
508
322
    }
509
510
#else
511
/*
512
    else if (maxX<8)
513
    {
514
      for( int y = 0; y <= maxY; y++)
515
      {
516
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
517
        v_level = _mm_and_si128(v_level,vlevmask);
518
        v_level = _mm_max_epi16 (v_level, v_min);
519
        v_level = _mm_min_epi16 (v_level, v_max);
520
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
521
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
522
523
        v_level = _mm_unpacklo_epi16(v_low,v_high);
524
        v_level =  _mm_add_epi32(v_level,v_add);
525
        v_level = _mm_sra_epi32(v_level,v_rshift);
526
527
        v_level = _mm_max_epi32 (v_level, v_Tmin);
528
        v_level = _mm_min_epi32 (v_level, v_Tmax);
529
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
530
531
        v_level = _mm_unpackhi_epi16(v_low,v_high);
532
        v_level =  _mm_add_epi32(v_level,v_add);
533
        v_level = _mm_sra_epi32(v_level,v_rshift);
534
535
        v_level = _mm_max_epi32 (v_level, v_Tmin);
536
        v_level = _mm_min_epi32 (v_level, v_Tmax);
537
        _mm_storeu_si128(( __m128i * )(piCoef+4+y*width ), v_level );
538
      }
539
    }
540
*/
541
0
    else
542
0
    {
543
0
      for( int y = 0; y <= maxY; y++)
544
0
      {
545
0
        for( int x = 0; x <= maxX; x+=8)
546
0
        {
547
0
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
548
0
          v_level = _mm_and_si128(v_level,vlevmask);
549
0
          v_level = _mm_max_epi16 (v_level, v_min);
550
0
          v_level = _mm_min_epi16 (v_level, v_max);
551
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
552
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
553
554
          v_level = _mm_unpacklo_epi16(v_low,v_high);
555
          v_level =  _mm_add_epi32(v_level,v_add);
556
          v_level = _mm_sra_epi32(v_level,v_rshift);
557
558
          v_level = _mm_max_epi32 (v_level, v_Tmin);
559
          v_level = _mm_min_epi32 (v_level, v_Tmax);
560
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
561
562
0
          if( maxX + 1 - x <= 4 ) continue;
563
564
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
565
0
          v_level =  _mm_add_epi32(v_level,v_add);
566
0
          v_level = _mm_sra_epi32(v_level,v_rshift);
567
568
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
569
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
570
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
571
0
        }
572
0
      }
573
0
    }
574
#endif
575
3.93k
  }
576
22.4k
  else  // rightshift <0
577
22.4k
  {
578
579
22.4k
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
580
22.4k
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
581
22.4k
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
582
22.4k
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
583
22.4k
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
584
22.4k
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);
585
586
22.4k
    if (maxX<4)
587
18.1k
    {
588
104k
      for( int y = 0; y <= maxY; y++)
589
86.0k
      {
590
86.0k
        __m128i v_level = maxX  > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
591
86.0k
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
592
5.91k
                                    : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );
593
594
86.0k
        v_level = _mm_max_epi16 (v_level, v_min);
595
86.0k
        v_level = _mm_min_epi16 (v_level, v_max);
596
86.0k
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
597
86.0k
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
598
599
86.0k
        v_level = _mm_unpacklo_epi16(v_low,v_high);
600
86.0k
        v_level = _mm_sll_epi32(v_level,v_lshift);
601
602
86.0k
        v_level = _mm_max_epi32 (v_level, v_Tmin);
603
86.0k
        v_level = _mm_min_epi32 (v_level, v_Tmax);
604
86.0k
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
605
606
86.0k
      }
607
18.1k
    }
608
#if USE_AVX2
609
4.39k
    else if (maxX<8)
610
3.74k
    {
611
3.74k
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
612
3.74k
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
613
614
29.9k
      for( int y = 0; y <= maxY; y++)
615
26.2k
      {
616
26.2k
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
617
26.2k
        v_level = _mm_and_si128(v_level,vlevmask);
618
26.2k
        v_level = _mm_max_epi16 (v_level, v_min);
619
26.2k
        v_level = _mm_min_epi16 (v_level, v_max);
620
26.2k
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
621
26.2k
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
622
623
26.2k
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
624
625
26.2k
        xv_level = _mm256_sll_epi32(xv_level,v_lshift);
626
627
26.2k
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
628
26.2k
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
629
630
26.2k
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
631
26.2k
      }
632
3.74k
    }
633
634
652
    else
635
652
    {
636
652
      __m256i xv_max =  _mm256_set1_epi16 ((short)inputMaximum);
637
652
      __m256i xv_min =  _mm256_set1_epi16 ((short)inputMinimum);
638
652
      __m256i xv_scale = _mm256_set1_epi16 ((short)scale);
639
652
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
640
652
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
641
642
6.24k
      for( int y = 0; y <= maxY; y++)
643
5.59k
      {
644
12.9k
        for( int x = 0; x <= maxX; x+=16)
645
7.35k
        {
646
7.35k
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride]  );
647
7.35k
          xv_level = _mm256_and_si256(xv_level,xvlevmask);
648
7.35k
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
649
7.35k
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
650
7.35k
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scale);
651
7.35k
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scale);
652
653
7.35k
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
654
7.35k
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);
655
656
657
7.35k
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
658
7.35k
          xv_level = _mm256_sll_epi32(xv_level,v_lshift);
659
660
7.35k
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
661
7.35k
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
662
663
7.35k
          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );
664
7.35k
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
665
7.35k
          xv_level = _mm256_sll_epi32(xv_level,v_lshift);
666
667
7.35k
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
668
7.35k
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
669
670
7.35k
          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
671
7.35k
        }
672
5.59k
      }
673
652
    }
674
675
#else
676
/*
677
    else if (maxX<8)
678
    {
679
      for( int y = 0; y <= maxY; y++)
680
      {
681
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
682
        v_level = _mm_and_si128(v_level,vlevmask);
683
        v_level = _mm_max_epi16 (v_level, v_min);
684
        v_level = _mm_min_epi16 (v_level, v_max);
685
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
686
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
687
688
        v_level = _mm_unpacklo_epi16(v_low,v_high);
689
        v_level = _mm_sll_epi32(v_level,v_lshift);
690
691
        v_level = _mm_max_epi32 (v_level, v_Tmin);
692
        v_level = _mm_min_epi32 (v_level, v_Tmax);
693
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
694
695
        v_level = _mm_unpackhi_epi16(v_low,v_high);
696
        v_level = _mm_sll_epi32(v_level,v_lshift);
697
698
        v_level = _mm_max_epi32 (v_level, v_Tmin);
699
        v_level = _mm_min_epi32 (v_level, v_Tmax);
700
        _mm_storeu_si128(( __m128i * )(piCoef+4+y*width ), v_level );
701
      }
702
    }
703
*/
704
0
    else
705
0
    {
706
0
      for( int y = 0; y <= maxY; y++)
707
0
      {
708
0
        for( int x = 0; x <= maxX; x+=8)
709
0
        {
710
0
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
711
0
          v_level = _mm_and_si128(v_level,vlevmask);
712
0
          v_level = _mm_max_epi16 (v_level, v_min);
713
0
          v_level = _mm_min_epi16 (v_level, v_max);
714
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
715
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
716
717
          v_level = _mm_unpacklo_epi16(v_low,v_high);
718
          v_level = _mm_sll_epi32(v_level,v_lshift);
719
720
          v_level = _mm_max_epi32 (v_level, v_Tmin);
721
          v_level = _mm_min_epi32 (v_level, v_Tmax);
722
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
723
724
0
          if( maxX + 1 - x <= 4 ) continue;
725
726
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
727
0
          v_level = _mm_sll_epi32(v_level,v_lshift);
728
729
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
730
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
731
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
732
0
        }
733
0
      }
734
0
    }
735
#endif
736
22.4k
  }
737
26.4k
}
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, int, int, short const*, unsigned long, int*, int, int, int)
Quant_avx2.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, int, int, short const*, unsigned long, int*, int, int, int)
Line
Count
Source
386
26.4k
{
387
26.4k
  const int inputMinimum = -(inputMaximum+1);
388
26.4k
  const TCoeff transformMinimum = -(transformMaximum+1);
389
26.4k
  const int width = restX+maxX+1;
390
26.4k
  __m128i vlevmask;
391
26.4k
  if (maxX<7)
392
21.0k
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
393
5.35k
  else
394
5.35k
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
395
396
26.4k
#if USE_AVX2
397
26.4k
  __m256i xvlevmask;
398
26.4k
  if (maxX<15)
399
25.9k
    xvlevmask = _mm256_loadu_si256( ( __m256i const * )&xlevmask[15-maxX] );
400
528
  else
401
528
    xvlevmask = _mm256_set_epi64x(0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff);
402
403
26.4k
#endif
404
405
26.4k
  if (rightShift>0)
406
3.93k
  {
407
3.93k
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);
408
409
3.93k
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
410
3.93k
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
411
3.93k
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
412
3.93k
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
413
3.93k
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
414
3.93k
    __m128i v_add = _mm_set1_epi32 (iAdd);
415
3.93k
    __m128i v_rshift = _mm_set1_epi64x (rightShift);
416
417
3.93k
    if (maxX<4)
418
2.96k
    {
419
9.63k
      for( int y = 0; y <= maxY; y++)
420
6.66k
      {
421
6.66k
        __m128i v_level = maxX  > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
422
6.66k
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
423
2.18k
                                    : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );
424
425
6.66k
        v_level = _mm_max_epi16 (v_level, v_min);
426
6.66k
        v_level = _mm_min_epi16 (v_level, v_max);
427
6.66k
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
428
6.66k
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
429
430
6.66k
        v_level = _mm_unpacklo_epi16(v_low,v_high);
431
6.66k
        v_level =  _mm_add_epi32(v_level,v_add);
432
6.66k
        v_level = _mm_sra_epi32(v_level,v_rshift);
433
434
6.66k
        v_level = _mm_max_epi32 (v_level, v_Tmin);
435
6.66k
        v_level = _mm_min_epi32 (v_level, v_Tmax);
436
6.66k
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
437
6.66k
      }
438
2.96k
    }
439
970
#if USE_AVX2
440
970
    else if (maxX<8)
441
648
    {
442
648
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
443
648
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
444
648
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
445
446
4.38k
      for( int y = 0; y <= maxY; y++)
447
3.74k
      {
448
3.74k
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
449
3.74k
        v_level = _mm_max_epi16 (v_level, v_min);
450
3.74k
        v_level = _mm_min_epi16 (v_level, v_max);
451
3.74k
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
452
3.74k
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
453
454
3.74k
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
455
456
3.74k
        xv_level =  _mm256_add_epi32(xv_level,xv_add);
457
3.74k
        xv_level = _mm256_sra_epi32(xv_level,v_rshift);
458
459
3.74k
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
460
3.74k
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
461
462
3.74k
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
463
3.74k
      }
464
648
    }
465
322
    else
466
322
    {
467
322
      __m256i xv_max =  _mm256_set1_epi16 ((short)inputMaximum);
468
322
      __m256i xv_min =  _mm256_set1_epi16 ((short)inputMinimum);
469
322
      __m256i xv_scale = _mm256_set1_epi16 ((short)scale);
470
322
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
471
322
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
472
322
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
473
474
3.81k
      for( int y = 0; y <= maxY; y++)
475
3.49k
      {
476
7.72k
        for( int x = 0; x <= maxX; x+=16)
477
4.22k
        {
478
4.22k
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride]  );
479
4.22k
          xv_level = _mm256_and_si256(xv_level,xvlevmask);
480
4.22k
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
481
4.22k
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
482
4.22k
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scale);
483
4.22k
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scale);
484
485
4.22k
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
486
4.22k
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);
487
488
489
4.22k
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
490
4.22k
          xv_level =  _mm256_add_epi32(xv_level,xv_add);
491
4.22k
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);
492
493
4.22k
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
494
4.22k
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
495
496
4.22k
          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );
497
4.22k
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
498
4.22k
          xv_level =  _mm256_add_epi32(xv_level,xv_add);
499
4.22k
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);
500
501
4.22k
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
502
4.22k
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
503
504
4.22k
          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
505
4.22k
        }
506
3.49k
      }
507
508
322
    }
509
510
#else
511
/*
512
    else if (maxX<8)
513
    {
514
      for( int y = 0; y <= maxY; y++)
515
      {
516
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
517
        v_level = _mm_and_si128(v_level,vlevmask);
518
        v_level = _mm_max_epi16 (v_level, v_min);
519
        v_level = _mm_min_epi16 (v_level, v_max);
520
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
521
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
522
523
        v_level = _mm_unpacklo_epi16(v_low,v_high);
524
        v_level =  _mm_add_epi32(v_level,v_add);
525
        v_level = _mm_sra_epi32(v_level,v_rshift);
526
527
        v_level = _mm_max_epi32 (v_level, v_Tmin);
528
        v_level = _mm_min_epi32 (v_level, v_Tmax);
529
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
530
531
        v_level = _mm_unpackhi_epi16(v_low,v_high);
532
        v_level =  _mm_add_epi32(v_level,v_add);
533
        v_level = _mm_sra_epi32(v_level,v_rshift);
534
535
        v_level = _mm_max_epi32 (v_level, v_Tmin);
536
        v_level = _mm_min_epi32 (v_level, v_Tmax);
537
        _mm_storeu_si128(( __m128i * )(piCoef+4+y*width ), v_level );
538
      }
539
    }
540
*/
541
    else
542
    {
543
      for( int y = 0; y <= maxY; y++)
544
      {
545
        for( int x = 0; x <= maxX; x+=8)
546
        {
547
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
548
          v_level = _mm_and_si128(v_level,vlevmask);
549
          v_level = _mm_max_epi16 (v_level, v_min);
550
          v_level = _mm_min_epi16 (v_level, v_max);
551
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
552
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
553
554
          v_level = _mm_unpacklo_epi16(v_low,v_high);
555
          v_level =  _mm_add_epi32(v_level,v_add);
556
          v_level = _mm_sra_epi32(v_level,v_rshift);
557
558
          v_level = _mm_max_epi32 (v_level, v_Tmin);
559
          v_level = _mm_min_epi32 (v_level, v_Tmax);
560
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
561
562
          if( maxX + 1 - x <= 4 ) continue;
563
564
          v_level = _mm_unpackhi_epi16(v_low,v_high);
565
          v_level =  _mm_add_epi32(v_level,v_add);
566
          v_level = _mm_sra_epi32(v_level,v_rshift);
567
568
          v_level = _mm_max_epi32 (v_level, v_Tmin);
569
          v_level = _mm_min_epi32 (v_level, v_Tmax);
570
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
571
        }
572
      }
573
    }
574
#endif
575
3.93k
  }
576
22.4k
  else  // rightshift <0
577
22.4k
  {
578
579
22.4k
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
580
22.4k
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
581
22.4k
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
582
22.4k
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
583
22.4k
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
584
22.4k
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);
585
586
22.4k
    if (maxX<4)
587
18.1k
    {
588
104k
      for( int y = 0; y <= maxY; y++)
589
86.0k
      {
590
86.0k
        __m128i v_level = maxX  > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
591
86.0k
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
592
5.91k
                                    : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );
593
594
86.0k
        v_level = _mm_max_epi16 (v_level, v_min);
595
86.0k
        v_level = _mm_min_epi16 (v_level, v_max);
596
86.0k
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
597
86.0k
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
598
599
86.0k
        v_level = _mm_unpacklo_epi16(v_low,v_high);
600
86.0k
        v_level = _mm_sll_epi32(v_level,v_lshift);
601
602
86.0k
        v_level = _mm_max_epi32 (v_level, v_Tmin);
603
86.0k
        v_level = _mm_min_epi32 (v_level, v_Tmax);
604
86.0k
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
605
606
86.0k
      }
607
18.1k
    }
608
4.39k
#if USE_AVX2
609
4.39k
    else if (maxX<8)
610
3.74k
    {
611
3.74k
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
612
3.74k
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
613
614
29.9k
      for( int y = 0; y <= maxY; y++)
615
26.2k
      {
616
26.2k
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
617
26.2k
        v_level = _mm_and_si128(v_level,vlevmask);
618
26.2k
        v_level = _mm_max_epi16 (v_level, v_min);
619
26.2k
        v_level = _mm_min_epi16 (v_level, v_max);
620
26.2k
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
621
26.2k
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
622
623
26.2k
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
624
625
26.2k
        xv_level = _mm256_sll_epi32(xv_level,v_lshift);
626
627
26.2k
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
628
26.2k
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
629
630
26.2k
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
631
26.2k
      }
632
3.74k
    }
633
634
652
    else
635
652
    {
636
652
      __m256i xv_max =  _mm256_set1_epi16 ((short)inputMaximum);
637
652
      __m256i xv_min =  _mm256_set1_epi16 ((short)inputMinimum);
638
652
      __m256i xv_scale = _mm256_set1_epi16 ((short)scale);
639
652
      __m256i xv_Tmax =  _mm256_set1_epi32 ((short)transformMaximum);
640
652
      __m256i xv_Tmin =  _mm256_set1_epi32 ((short)transformMinimum);
641
642
6.24k
      for( int y = 0; y <= maxY; y++)
643
5.59k
      {
644
12.9k
        for( int x = 0; x <= maxX; x+=16)
645
7.35k
        {
646
7.35k
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride]  );
647
7.35k
          xv_level = _mm256_and_si256(xv_level,xvlevmask);
648
7.35k
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
649
7.35k
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
650
7.35k
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scale);
651
7.35k
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scale);
652
653
7.35k
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
654
7.35k
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);
655
656
657
7.35k
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
658
7.35k
          xv_level = _mm256_sll_epi32(xv_level,v_lshift);
659
660
7.35k
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
661
7.35k
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
662
663
7.35k
          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );
664
7.35k
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
665
7.35k
          xv_level = _mm256_sll_epi32(xv_level,v_lshift);
666
667
7.35k
          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
668
7.35k
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
669
670
7.35k
          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
671
7.35k
        }
672
5.59k
      }
673
652
    }
674
675
#else
676
/*
677
    else if (maxX<8)
678
    {
679
      for( int y = 0; y <= maxY; y++)
680
      {
681
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
682
        v_level = _mm_and_si128(v_level,vlevmask);
683
        v_level = _mm_max_epi16 (v_level, v_min);
684
        v_level = _mm_min_epi16 (v_level, v_max);
685
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
686
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
687
688
        v_level = _mm_unpacklo_epi16(v_low,v_high);
689
        v_level = _mm_sll_epi32(v_level,v_lshift);
690
691
        v_level = _mm_max_epi32 (v_level, v_Tmin);
692
        v_level = _mm_min_epi32 (v_level, v_Tmax);
693
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
694
695
        v_level = _mm_unpackhi_epi16(v_low,v_high);
696
        v_level = _mm_sll_epi32(v_level,v_lshift);
697
698
        v_level = _mm_max_epi32 (v_level, v_Tmin);
699
        v_level = _mm_min_epi32 (v_level, v_Tmax);
700
        _mm_storeu_si128(( __m128i * )(piCoef+4+y*width ), v_level );
701
      }
702
    }
703
*/
704
    else
705
    {
706
      for( int y = 0; y <= maxY; y++)
707
      {
708
        for( int x = 0; x <= maxX; x+=8)
709
        {
710
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
711
          v_level = _mm_and_si128(v_level,vlevmask);
712
          v_level = _mm_max_epi16 (v_level, v_min);
713
          v_level = _mm_min_epi16 (v_level, v_max);
714
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
715
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
716
717
          v_level = _mm_unpacklo_epi16(v_low,v_high);
718
          v_level = _mm_sll_epi32(v_level,v_lshift);
719
720
          v_level = _mm_max_epi32 (v_level, v_Tmin);
721
          v_level = _mm_min_epi32 (v_level, v_Tmax);
722
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
723
724
          if( maxX + 1 - x <= 4 ) continue;
725
726
          v_level = _mm_unpackhi_epi16(v_low,v_high);
727
          v_level = _mm_sll_epi32(v_level,v_lshift);
728
729
          v_level = _mm_max_epi32 (v_level, v_Tmin);
730
          v_level = _mm_min_epi32 (v_level, v_Tmax);
731
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
732
        }
733
      }
734
    }
735
#endif
736
22.4k
  }
737
26.4k
}
738
739
// Dequantizes a block of 32-bit quantized levels (piQCoef) into piCoef using
// 128-bit SSE intrinsics.
//   maxX, maxY            : last column / row index processed (loops run 0..maxY and 0..maxX)
//   restX                 : added to maxX+1 to form the output row stride `width`
//   scale                 : multiplier, broadcast as a 16-bit value
//   piQCoef, piQCfStride  : input levels and the distance between input rows (in elements)
//   piCoef                : output; rows are `width` elements apart
//   rightShift            : > 0 -> add a rounding offset and shift right arithmetically,
//                           otherwise shift left by -rightShift
//   inputMaximum          : upper clamp for the 16-bit levels, lower clamp is -(inputMaximum+1)
//   transformMaximum      : upper clamp for the 32-bit results, lower clamp is -(transformMaximum+1)
// NOTE(review): the template parameter vext is not referenced anywhere in the body.
template< X86_VEXT vext>
740
static void DeQuantCorePCMSIMD(const int maxX,const int restX,const int maxY,const int scale,TCoeff   *const piQCoef,const size_t piQCfStride,TCoeff   *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
741
379
{
742
379
  const int inputMinimum = -(inputMaximum+1);
743
379
  const TCoeff transformMinimum = -(transformMaximum+1);
744
379
  const int width = restX+maxX+1;
745
379
  __m128i vlevmask;
  // Mask ANDed onto the packed 16-bit levels below. levmask is defined outside this
  // block; presumably the offset load zeroes the lanes past maxX - confirm against its definition.
746
379
  if (maxX<7)
747
55
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
748
324
  else
749
324
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
750
751
379
  if (rightShift>0)
752
354
  {
    // Rounding offset: half of the divisor implied by the right shift.
753
354
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);
754
    // NOTE(review): transformMaximum/transformMinimum and scale are narrowed to short
    // before being broadcast; this presumes they fit in 16 bits - confirm with callers.
755
354
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
756
354
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
757
354
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
758
354
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
759
354
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
760
354
    __m128i v_add = _mm_set1_epi32 (iAdd);
761
354
    __m128i v_rshift = _mm_set1_epi64x (rightShift);
762
763
354
    if (maxX<4)
764
51
    {
      // Narrow path: one 128-bit load and one 128-bit store (4 values) per row.
765
527
      for( int y = 0; y <= maxY; y++)
766
476
      {
767
476
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
        // Pack the 32-bit levels to 16 bit with signed saturation; both halves hold the same 4 values.
768
476
        v_level = _mm_packs_epi32 (v_level,v_level);
769
476
        v_level = _mm_and_si128(v_level,vlevmask);
770
476
        v_level = _mm_max_epi16 (v_level, v_min);
771
476
        v_level = _mm_min_epi16 (v_level, v_max);
        // 16x16 -> 32 bit multiply: low and high product halves are computed separately ...
772
476
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
773
476
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
774
        // ... and interleaved back into four 32-bit products.
775
476
        v_level = _mm_unpacklo_epi16(v_low,v_high);
776
476
        v_level =  _mm_add_epi32(v_level,v_add);
777
476
        v_level = _mm_sra_epi32(v_level,v_rshift);
778
779
476
        v_level = _mm_max_epi32 (v_level, v_Tmin);
780
476
        v_level = _mm_min_epi32 (v_level, v_Tmax);
781
476
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
782
476
      }
783
51
    }
784
303
    else
785
303
    {
      // Wide path: 8 coefficients per inner iteration (two loads of 4, two stores of 4).
      // NOTE(review): both loads and both stores are unconditional, so a full group of 8
      // is read and written even when fewer than 8 columns remain - the buffers
      // presumably provide room for that; confirm with callers.
786
3.84k
      for( int y = 0; y <= maxY; y++)
787
3.54k
      {
788
9.33k
        for( int x = 0; x <= maxX; x+=8)
789
5.78k
        {
790
791
5.78k
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
792
5.78k
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride]  );
          // Pack 2 x 4 32-bit levels into 8 16-bit lanes with signed saturation.
793
5.78k
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
794
5.78k
          v_level = _mm_and_si128(v_level,vlevmask);
795
5.78k
          v_level = _mm_max_epi16 (v_level, v_min);
796
5.78k
          v_level = _mm_min_epi16 (v_level, v_max);
797
5.78k
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
798
5.78k
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
799
          // First four 32-bit products.
800
5.78k
          v_level = _mm_unpacklo_epi16(v_low,v_high);
801
5.78k
          v_level =  _mm_add_epi32(v_level,v_add);
802
5.78k
          v_level = _mm_sra_epi32(v_level,v_rshift);
803
804
5.78k
          v_level = _mm_max_epi32 (v_level, v_Tmin);
805
5.78k
          v_level = _mm_min_epi32 (v_level, v_Tmax);
806
5.78k
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
807
          // Last four 32-bit products.
808
5.78k
          v_level = _mm_unpackhi_epi16(v_low,v_high);
809
5.78k
          v_level =  _mm_add_epi32(v_level,v_add);
810
5.78k
          v_level = _mm_sra_epi32(v_level,v_rshift);
811
812
5.78k
          v_level = _mm_max_epi32 (v_level, v_Tmin);
813
5.78k
          v_level = _mm_min_epi32 (v_level, v_Tmax);
814
5.78k
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
815
5.78k
        }
816
3.54k
      }
817
303
    }
818
354
  }
819
25
  else  // rightShift <= 0: no rounding offset, products are shifted left by -rightShift
820
25
  {
821
    // NOTE(review): same 16-bit narrowing of transformMaximum/transformMinimum and scale
    // as in the right-shift branch - confirm the values fit.
822
25
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
823
25
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
824
25
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
825
25
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
826
25
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
827
25
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);
828
829
25
    if (maxX<4)
830
4
    {
      // Narrow path: one 128-bit load and one 128-bit store (4 values) per row.
831
32
      for( int y = 0; y <= maxY; y++)
832
28
      {
833
28
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
        // Pack to 16 bit with signed saturation; both halves hold the same 4 values.
834
28
        v_level = _mm_packs_epi32 (v_level,v_level);
835
28
        v_level = _mm_and_si128(v_level,vlevmask);
836
837
28
        v_level = _mm_max_epi16 (v_level, v_min);
838
28
        v_level = _mm_min_epi16 (v_level, v_max);
839
28
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
840
28
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
841
        // Interleave low/high product halves into four 32-bit products.
842
28
        v_level = _mm_unpacklo_epi16(v_low,v_high);
843
28
        v_level = _mm_sll_epi32(v_level,v_lshift);
844
845
28
        v_level = _mm_max_epi32 (v_level, v_Tmin);
846
28
        v_level = _mm_min_epi32 (v_level, v_Tmax);
847
28
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
848
28
      }
849
4
    }
850
21
    else
851
21
    {
      // Wide path: 8 coefficients per inner iteration; both loads and both stores
      // are unconditional (see the review note on the right-shift wide path semantics:
      // a full group of 8 is always read and written).
852
209
      for( int y = 0; y <= maxY; y++)
853
188
      {
854
448
        for( int x = 0; x <= maxX; x+=8)
855
260
        {
856
260
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
857
260
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride]  );
858
260
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
859
260
          v_level = _mm_and_si128(v_level,vlevmask);
860
260
          v_level = _mm_max_epi16 (v_level, v_min);
861
260
          v_level = _mm_min_epi16 (v_level, v_max);
862
260
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
863
260
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
864
          // First four 32-bit products.
865
260
          v_level = _mm_unpacklo_epi16(v_low,v_high);
866
260
          v_level = _mm_sll_epi32(v_level,v_lshift);
867
868
260
          v_level = _mm_max_epi32 (v_level, v_Tmin);
869
260
          v_level = _mm_min_epi32 (v_level, v_Tmax);
870
260
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
871
          // Last four 32-bit products.
872
260
          v_level = _mm_unpackhi_epi16(v_low,v_high);
873
260
          v_level = _mm_sll_epi32(v_level,v_lshift);
874
875
260
          v_level = _mm_max_epi32 (v_level, v_Tmin);
876
260
          v_level = _mm_min_epi32 (v_level, v_Tmax);
877
260
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
878
260
        }
879
188
      }
880
21
    }
881
25
  }
882
379
}
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantCorePCMSIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, int, int, int*, unsigned long, int*, int, int, int)
Quant_avx2.cpp:void vvdec::DeQuantCorePCMSIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, int, int, int*, unsigned long, int*, int, int, int)
Line
Count
Source
741
379
{
742
379
  const int inputMinimum = -(inputMaximum+1);
743
379
  const TCoeff transformMinimum = -(transformMaximum+1);
744
379
  const int width = restX+maxX+1;
745
379
  __m128i vlevmask;
746
379
  if (maxX<7)
747
55
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
748
324
  else
749
324
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
750
751
379
  if (rightShift>0)
752
354
  {
753
354
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);
754
755
354
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
756
354
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
757
354
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
758
354
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
759
354
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
760
354
    __m128i v_add = _mm_set1_epi32 (iAdd);
761
354
    __m128i v_rshift = _mm_set1_epi64x (rightShift);
762
763
354
    if (maxX<4)
764
51
    {
765
527
      for( int y = 0; y <= maxY; y++)
766
476
      {
767
476
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
768
476
        v_level = _mm_packs_epi32 (v_level,v_level);
769
476
        v_level = _mm_and_si128(v_level,vlevmask);
770
476
        v_level = _mm_max_epi16 (v_level, v_min);
771
476
        v_level = _mm_min_epi16 (v_level, v_max);
772
476
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
773
476
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
774
775
476
        v_level = _mm_unpacklo_epi16(v_low,v_high);
776
476
        v_level =  _mm_add_epi32(v_level,v_add);
777
476
        v_level = _mm_sra_epi32(v_level,v_rshift);
778
779
476
        v_level = _mm_max_epi32 (v_level, v_Tmin);
780
476
        v_level = _mm_min_epi32 (v_level, v_Tmax);
781
476
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
782
476
      }
783
51
    }
784
303
    else
785
303
    {
786
3.84k
      for( int y = 0; y <= maxY; y++)
787
3.54k
      {
788
9.33k
        for( int x = 0; x <= maxX; x+=8)
789
5.78k
        {
790
791
5.78k
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
792
5.78k
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride]  );
793
5.78k
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
794
5.78k
          v_level = _mm_and_si128(v_level,vlevmask);
795
5.78k
          v_level = _mm_max_epi16 (v_level, v_min);
796
5.78k
          v_level = _mm_min_epi16 (v_level, v_max);
797
5.78k
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
798
5.78k
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
799
800
5.78k
          v_level = _mm_unpacklo_epi16(v_low,v_high);
801
5.78k
          v_level =  _mm_add_epi32(v_level,v_add);
802
5.78k
          v_level = _mm_sra_epi32(v_level,v_rshift);
803
804
5.78k
          v_level = _mm_max_epi32 (v_level, v_Tmin);
805
5.78k
          v_level = _mm_min_epi32 (v_level, v_Tmax);
806
5.78k
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
807
808
5.78k
          v_level = _mm_unpackhi_epi16(v_low,v_high);
809
5.78k
          v_level =  _mm_add_epi32(v_level,v_add);
810
5.78k
          v_level = _mm_sra_epi32(v_level,v_rshift);
811
812
5.78k
          v_level = _mm_max_epi32 (v_level, v_Tmin);
813
5.78k
          v_level = _mm_min_epi32 (v_level, v_Tmax);
814
5.78k
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
815
5.78k
        }
816
3.54k
      }
817
303
    }
818
354
  }
819
25
  else  // rightshift <0
820
25
  {
821
822
25
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
823
25
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
824
25
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
825
25
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
826
25
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
827
25
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);
828
829
25
    if (maxX<4)
830
4
    {
831
32
      for( int y = 0; y <= maxY; y++)
832
28
      {
833
28
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
834
28
        v_level = _mm_packs_epi32 (v_level,v_level);
835
28
        v_level = _mm_and_si128(v_level,vlevmask);
836
837
28
        v_level = _mm_max_epi16 (v_level, v_min);
838
28
        v_level = _mm_min_epi16 (v_level, v_max);
839
28
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
840
28
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
841
842
28
        v_level = _mm_unpacklo_epi16(v_low,v_high);
843
28
        v_level = _mm_sll_epi32(v_level,v_lshift);
844
845
28
        v_level = _mm_max_epi32 (v_level, v_Tmin);
846
28
        v_level = _mm_min_epi32 (v_level, v_Tmax);
847
28
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
848
28
      }
849
4
    }
850
21
    else
851
21
    {
852
209
      for( int y = 0; y <= maxY; y++)
853
188
      {
854
448
        for( int x = 0; x <= maxX; x+=8)
855
260
        {
856
260
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
857
260
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride]  );
858
260
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
859
260
          v_level = _mm_and_si128(v_level,vlevmask);
860
260
          v_level = _mm_max_epi16 (v_level, v_min);
861
260
          v_level = _mm_min_epi16 (v_level, v_max);
862
260
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
863
260
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);
864
865
260
          v_level = _mm_unpacklo_epi16(v_low,v_high);
866
260
          v_level = _mm_sll_epi32(v_level,v_lshift);
867
868
260
          v_level = _mm_max_epi32 (v_level, v_Tmin);
869
260
          v_level = _mm_min_epi32 (v_level, v_Tmax);
870
260
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
871
872
260
          v_level = _mm_unpackhi_epi16(v_low,v_high);
873
260
          v_level = _mm_sll_epi32(v_level,v_lshift);
874
875
260
          v_level = _mm_max_epi32 (v_level, v_Tmin);
876
260
          v_level = _mm_min_epi32 (v_level, v_Tmax);
877
260
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
878
260
        }
879
188
      }
880
21
    }
881
25
  }
882
379
}
883
884
template< X86_VEXT vext>
885
static void DeQuantScalingPCMCoreSIMD(const int maxX,const int restX,const int maxY,const int scaleQP,const int     *piDequantCoef,TCoeff   *const piQCoef,const size_t piQCfStride,TCoeff   *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
886
0
{
887
0
  const int inputMinimum = -(inputMaximum+1);
888
0
  const TCoeff transformMinimum = -(transformMaximum+1);
889
0
  const int width = restX+maxX+1;
890
0
  __m128i vlevmask;
891
0
  if (maxX<7)
892
0
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
893
0
  else
894
0
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
895
896
0
  if (rightShift>0)
897
0
  {
898
0
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);
899
900
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
901
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
902
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
903
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
904
0
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
905
0
    __m128i v_add = _mm_set1_epi32 (iAdd);
906
0
    __m128i v_rshift = _mm_set1_epi64x (rightShift);
907
908
0
    if (maxX<4)
909
0
    {
910
0
      for( int y = 0; y <= maxY; y++)
911
0
      {
912
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
913
0
        __m128i v_scale = maxX  > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
914
0
                                : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
915
0
                                            : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );
916
917
0
        v_level = _mm_packs_epi32 (v_level,v_level);
918
0
        v_level = _mm_and_si128(v_level,vlevmask);
919
0
        v_level = _mm_max_epi16 (v_level, v_min);
920
0
        v_level = _mm_min_epi16 (v_level, v_max);
921
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
922
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
923
924
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
925
0
        v_level = _mm_mullo_epi32(v_level,v_scale);
926
927
0
        v_level =  _mm_add_epi32(v_level,v_add);
928
0
        v_level = _mm_sra_epi32(v_level,v_rshift);
929
930
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
931
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
932
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
933
0
      }
934
0
    }
935
0
    else
936
0
    {
937
0
      for( int y = 0; y <= maxY; y++)
938
0
      {
939
0
        for( int x = 0; x <= maxX; x+=8)
940
0
        {
941
0
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
942
0
          __m128i v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );
943
0
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride]  );
944
0
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
945
946
0
          v_level = _mm_and_si128(v_level,vlevmask);
947
0
          v_level = _mm_max_epi16 (v_level, v_min);
948
0
          v_level = _mm_min_epi16 (v_level, v_max);
949
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
950
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
951
952
0
          v_level = _mm_unpacklo_epi16(v_low,v_high);
953
954
0
          v_level = _mm_mullo_epi32(v_level,v_scale);
955
956
0
          v_level =  _mm_add_epi32(v_level,v_add);
957
0
          v_level = _mm_sra_epi32(v_level,v_rshift);
958
959
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
960
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
961
0
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
962
963
0
          v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4]);
964
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
965
0
          v_level = _mm_mullo_epi32(v_level,v_scale);
966
0
          v_level =  _mm_add_epi32(v_level,v_add);
967
0
          v_level = _mm_sra_epi32(v_level,v_rshift);
968
969
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
970
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
971
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
972
0
        }
973
0
      }
974
0
    }
975
0
  }
976
0
  else  // rightshift <0
977
0
  {
978
0
    __m128i v_max =  _mm_set1_epi16 ((short)inputMaximum);
979
0
    __m128i v_min =  _mm_set1_epi16 ((short)inputMinimum);
980
0
    __m128i v_Tmax =  _mm_set1_epi32 ((short)transformMaximum);
981
0
    __m128i v_Tmin =  _mm_set1_epi32 ((short)transformMinimum);
982
0
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
983
0
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);
984
985
0
    if (maxX<4)
986
0
    {
987
0
      for( int y = 0; y <= maxY; y++)
988
0
      {
989
0
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
990
0
        __m128i v_scale = maxX  > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
991
0
                                : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
992
0
                                            : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );
993
994
0
        v_level = _mm_packs_epi32 (v_level,v_level);
995
0
        v_level = _mm_and_si128(v_level,vlevmask);
996
997
0
        v_level = _mm_max_epi16 (v_level, v_min);
998
0
        v_level = _mm_min_epi16 (v_level, v_max);
999
0
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
1000
0
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
1001
1002
0
        v_level = _mm_unpacklo_epi16(v_low,v_high);
1003
0
        v_level = _mm_mullo_epi32(v_level,v_scale);
1004
1005
0
        v_level = _mm_sll_epi32(v_level,v_lshift);
1006
0
        v_level = _mm_max_epi32 (v_level, v_Tmin);
1007
0
        v_level = _mm_min_epi32 (v_level, v_Tmax);
1008
0
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
1009
0
      }
1010
0
    }
1011
0
    else
1012
0
    {
1013
0
      for( int y = 0; y <= maxY; y++)
1014
0
      {
1015
0
        for( int x = 0; x <= maxX; x+=8)
1016
0
        {
1017
0
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride]  );
1018
0
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride]  );
1019
0
          __m128i v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );
1020
1021
0
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
1022
0
          v_level = _mm_and_si128(v_level,vlevmask);
1023
0
          v_level = _mm_max_epi16 (v_level, v_min);
1024
0
          v_level = _mm_min_epi16 (v_level, v_max);
1025
0
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
1026
0
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
1027
1028
0
          v_level = _mm_unpacklo_epi16(v_low,v_high);
1029
0
          v_level = _mm_mullo_epi32(v_level,v_scale);
1030
1031
0
          v_level = _mm_sll_epi32(v_level,v_lshift);
1032
1033
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
1034
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
1035
0
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );
1036
1037
0
          v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4]);
1038
1039
0
          v_level = _mm_unpackhi_epi16(v_low,v_high);
1040
0
          v_level = _mm_mullo_epi32(v_level,v_scale);
1041
1042
0
          v_level = _mm_sll_epi32(v_level,v_lshift);
1043
0
          v_level = _mm_max_epi32 (v_level, v_Tmin);
1044
0
          v_level = _mm_min_epi32 (v_level, v_Tmax);
1045
0
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
1046
0
        }
1047
0
      }
1048
0
    }
1049
0
  }
1050
0
}
Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantScalingPCMCoreSIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, int, int, int const*, int*, unsigned long, int*, int, int, int)
Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantScalingPCMCoreSIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, int, int, int const*, int*, unsigned long, int*, int, int, int)
1051
1052
1053
1054
1055
template<X86_VEXT vext>
1056
void Quant::_initQuantX86()
1057
25.8k
{
1058
25.8k
  DeQuant = DeQuantCoreSIMD<vext>;
1059
25.8k
  DeQuantPCM = DeQuantCorePCMSIMD<vext>;
1060
25.8k
  DeQuantScaling    = DeQuantScalingCoreSIMD<vext>;
1061
25.8k
  DeQuantScalingPCM    = DeQuantScalingPCMCoreSIMD<vext>;
1062
1063
25.8k
}
Unexecuted instantiation: void vvdec::Quant::_initQuantX86<(vvdec::x86_simd::X86_VEXT)1>()
void vvdec::Quant::_initQuantX86<(vvdec::x86_simd::X86_VEXT)4>()
Line
Count
Source
1057
25.8k
{
1058
25.8k
  DeQuant = DeQuantCoreSIMD<vext>;
1059
25.8k
  DeQuantPCM = DeQuantCorePCMSIMD<vext>;
1060
25.8k
  DeQuantScaling    = DeQuantScalingCoreSIMD<vext>;
1061
25.8k
  DeQuantScalingPCM    = DeQuantScalingPCMCoreSIMD<vext>;
1062
1063
25.8k
}
1064
// Explicit instantiation of the initialiser for the SIMD level named by SIMDX86
// (SIMDX86 is defined outside this chunk; presumably set per translation unit - confirm).
template void Quant::_initQuantX86<SIMDX86>();
1065
1066
#endif // TARGET_SIMD_X86
1067
#endif
1068
1069
}