/src/vvdec/source/Lib/CommonLib/x86/QuantX86.h
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | |
43 | | /** \file QuantX86.h |
44 | | \brief SIMD for Quant/Dequant |
45 | | */ |
46 | | |
47 | | #include "CommonLib/CommonDef.h" |
48 | | #include "CommonDefX86.h" |
49 | | #include "CommonLib/Quant.h" |
50 | | |
51 | | namespace vvdec |
52 | | { |
53 | | |
54 | | #if ENABLE_SIMD_OPT_QUANT |
55 | | #ifdef TARGET_SIMD_X86 |
56 | | |
// Older toolchains lack the _mm256_set_m128i composition intrinsic; synthesize
// it from a cast of the low half plus an insert of the high half.
// VVCLIB_OWN_mm256_set_m128i records that this header supplied the definition.
#if USE_AVX2 && !defined( _mm256_set_m128i )
#define VVCLIB_OWN_mm256_set_m128i
#define _mm256_set_m128i( v0, v1 ) _mm256_inserti128_si256( _mm256_castsi128_si256( v1 ), ( v0 ), 1 )

#endif

// Sliding lane masks: loading 8 words starting at &levmask[7-maxX] (resp. 16
// words at &xlevmask[15-maxX]) yields exactly maxX+1 leading 0xffff lanes
// followed by zero lanes, i.e. a mask that keeps coefficients 0..maxX of a row
// and zeroes the rest of the SIMD vector.
static constexpr unsigned short levmask[16] = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0,0,0,0,0,0,0,0};
#if USE_AVX2
static constexpr unsigned short xlevmask[32] = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
#endif
67 | | |
/** \brief SIMD dequantization using a per-position scaling list (scaling matrix).
 *
 *  For every row y in [0, maxY] this routine:
 *    1. loads up to maxX+1 quantized levels from piQCoef (row stride piQCfStride,
 *       16-bit TCoeffSig values),
 *    2. clamps them to [inputMinimum, inputMaximum],
 *    3. multiplies by the scalar QP factor scaleQP using a 16x16 -> 32 bit
 *       widening multiply (mullo/mulhi pair + unpack),
 *    4. multiplies by the per-position 32-bit scaling-list entry
 *       piDequantCoef[y*width + x],
 *    5. if rightShift > 0: adds the rounding offset (1 << (rightShift-1)) and
 *       arithmetic-shifts right by rightShift; otherwise shifts left by
 *       -rightShift (no rounding needed),
 *    6. clamps the result to [transformMinimum, transformMaximum] and stores it
 *       as 32-bit TCoeff into piCoef with row stride width = restX+maxX+1.
 *
 *  Three lane-width paths are selected from maxX (last coefficient column):
 *  a narrow path for maxX<4 using partial loads, an 8-wide path for maxX<8,
 *  and a strip-mined loop (16 coefficients per iteration with AVX2, 8 without).
 *  Stores always write full vectors; lanes past maxX are zeroed through the
 *  levmask/xlevmask AND (masked levels become 0, and 0*scale+iAdd >> rightShift
 *  == 0), or — in the maxX<4 path — by loading explicit zeros.
 *
 *  NOTE(review): the rightShift>0 / AVX2 / maxX<8 branch does NOT apply the
 *  vlevmask AND before processing, unlike its rightShift<=0 counterpart below;
 *  presumably the coefficient buffer is zero past maxX so the masking is
 *  redundant there — confirm against the callers.
 */
template< X86_VEXT vext>
static void DeQuantScalingCoreSIMD(const int maxX,const int restX,const int maxY,const int scaleQP,const int *piDequantCoef,const TCoeffSig*const piQCoef,const size_t piQCfStride,TCoeff *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
{
  const int inputMinimum = -(inputMaximum+1);
  const TCoeff transformMinimum = -(transformMaximum+1);
  const int width = restX+maxX+1;   // output row stride (full block width)
  // Build the 8-lane validity mask: maxX+1 leading 0xffff words (see levmask).
  __m128i vlevmask;
  if (maxX<7)
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
  else
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);
#if USE_AVX2
  // 16-lane variant of the validity mask for the AVX2 strip-mined loop.
  __m256i xvlevmask;
  if (maxX<15)
    xvlevmask = _mm256_loadu_si256( ( __m256i const * )&xlevmask[15-maxX] );
  else
    xvlevmask = _mm256_set_epi64x(0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff);
#endif
  if (rightShift>0)
  {
    // Rounding offset for the arithmetic right shift.
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);

    __m128i v_max = _mm_set1_epi16 ((short)inputMaximum);
    __m128i v_min = _mm_set1_epi16 ((short)inputMinimum);
    // NOTE(review): the (short) casts below truncate the 32-bit transform clamp
    // bounds before broadcasting into 32-bit lanes; this is only correct while
    // |transformMaximum| fits in 16 bits (15-bit transform range) — confirm.
    __m128i v_Tmax = _mm_set1_epi32 ((short)transformMaximum);
    __m128i v_Tmin = _mm_set1_epi32 ((short)transformMinimum);
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
    __m128i v_add = _mm_set1_epi32 (iAdd);
    __m128i v_rshift = _mm_set1_epi64x (rightShift);  // shift count for _mm_sra_epi32
    if (maxX<4)
    {
      // Narrow blocks (1, 2 or 4 coefficients per row): load exactly the valid
      // levels/scales and zero-fill the rest, so no mask is needed.
      for( int y = 0; y <= maxY; y++)
      {
        __m128i v_level = maxX > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
                        : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );

        __m128i v_scale = maxX > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
                        : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
                        : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );

        // Clamp input, then widen level*scaleQP to 32 bit via mullo/mulhi.
        v_level = _mm_max_epi16 (v_level, v_min);
        v_level = _mm_min_epi16 (v_level, v_max);
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);

        // Apply scaling list, round, shift and clamp to transform range.
        v_level = _mm_unpacklo_epi16(v_low,v_high);
        v_level = _mm_mullo_epi32(v_level,v_scale);
        v_level = _mm_add_epi32(v_level,v_add);
        v_level = _mm_sra_epi32(v_level,v_rshift);

        v_level = _mm_max_epi32 (v_level, v_Tmin);
        v_level = _mm_min_epi32 (v_level, v_Tmax);
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
      }
    }
#if USE_AVX2
    else if (maxX<8)
    {
      // One 8-coefficient row per iteration: 128-bit level math widened into a
      // single 256-bit vector for the 32-bit stage.
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
      __m256i xv_Tmax = _mm256_set1_epi32 ((short)transformMaximum);
      __m256i xv_Tmin = _mm256_set1_epi32 ((short)transformMinimum);

      for( int y = 0; y <= maxY; y++)
      {
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride] );
        __m256i v_scale = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width] );

        // (no vlevmask AND here — see NOTE(review) in the function header)
        v_level = _mm_max_epi16 (v_level, v_min);
        v_level = _mm_min_epi16 (v_level, v_max);
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
        // Compose the eight 32-bit products: low half from unpacklo, high half
        // from unpackhi.
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
        xv_level = _mm256_mullo_epi32(xv_level,v_scale);
        xv_level = _mm256_add_epi32(xv_level,xv_add);
        xv_level = _mm256_sra_epi32(xv_level,v_rshift);
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
      }
    }
    else
    {
      // Wide blocks: 16 coefficients per inner iteration.
      __m256i xv_max = _mm256_set1_epi16 ((short)inputMaximum);
      __m256i xv_min = _mm256_set1_epi16 ((short)inputMinimum);
      __m256i xv_scaleQP = _mm256_set1_epi16 ((short)scaleQP);
      __m256i xv_add = _mm256_set1_epi32 (iAdd);
      __m256i xv_Tmax = _mm256_set1_epi32 ((short)transformMaximum);
      __m256i xv_Tmin = _mm256_set1_epi32 ((short)transformMinimum);

      __m256i v_scale_l,v_scale_h;
      for( int y = 0; y <= maxY; y++)
      {
        for( int x = 0; x <= maxX; x+=16)
        {
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride] );
          v_scale_l = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x ]);
          v_scale_h = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x + 8 ]);

          // Zero lanes past maxX, clamp, widen level*scaleQP via mullo/mulhi.
          xv_level = _mm256_and_si256(xv_level,xvlevmask);
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scaleQP);
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scaleQP);

          // 256-bit unpack works per 128-bit lane; the 0xD8 permute reorders
          // the 64-bit quarters so unpacklo/unpackhi yield products 0..7 and
          // 8..15 in coefficient order.
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);

          // First 8 products: scale, round, shift, clamp, store.
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
          xv_level = _mm256_mullo_epi32(xv_level,v_scale_l);

          xv_level = _mm256_add_epi32(xv_level,xv_add);
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);

          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);

          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );

          // Second 8 products (coefficients x+8 .. x+15).
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
          xv_level = _mm256_mullo_epi32(xv_level,v_scale_h);

          xv_level = _mm256_add_epi32(xv_level,xv_add);
          xv_level = _mm256_sra_epi32(xv_level,v_rshift);

          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);

          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
        }
      }
    }
#else
    else
    {
      // SSE fallback for maxX>=4: 8 coefficients per inner iteration, the
      // upper 4 only when the row actually extends past x+4.
      __m128i v_scale_l,v_scale_h;
      for( int y = 0; y <= maxY; y++)
      {
        for( int x = 0; x <= maxX; x+=8)
        {
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride] );

          v_scale_l = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );
          // Zero invalid lanes, clamp, widen level*scaleQP to 32 bit.
          v_level = _mm_and_si128(v_level,vlevmask);
          v_level = _mm_max_epi16 (v_level, v_min);
          v_level = _mm_min_epi16 (v_level, v_max);
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);

          // Lower 4 products: scale, round, shift, clamp, store.
          v_level = _mm_unpacklo_epi16(v_low,v_high);
          v_level = _mm_mullo_epi32(v_level,v_scale_l);
          v_level = _mm_add_epi32(v_level,v_add);
          v_level = _mm_sra_epi32(v_level,v_rshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+y*width +x), v_level );
          if (maxX + 1 - x > 4)
          {
            // Upper 4 products (coefficients x+4 .. x+7).
            v_scale_h = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4] );
            v_level = _mm_unpackhi_epi16(v_low,v_high);
            v_level = _mm_mullo_epi32(v_level,v_scale_h);
            v_level = _mm_add_epi32(v_level,v_add);
            v_level = _mm_sra_epi32(v_level,v_rshift);
            v_level = _mm_max_epi32 (v_level, v_Tmin);
            v_level = _mm_min_epi32 (v_level, v_Tmax);
            _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
          }
        }
      }
    }
#endif
  }
  else // rightshift <0
  {
    // Mirror of the path above, but the scaled product is shifted LEFT by
    // -rightShift (no rounding offset needed for a left shift).
    __m128i v_max = _mm_set1_epi16 ((short)inputMaximum);
    __m128i v_min = _mm_set1_epi16 ((short)inputMinimum);
    // NOTE(review): same (short) truncation of the 32-bit clamp bounds as above.
    __m128i v_Tmax = _mm_set1_epi32 ((short)transformMaximum);
    __m128i v_Tmin = _mm_set1_epi32 ((short)transformMinimum);
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);  // shift count for _mm_sll_epi32
    if (maxX<4)
    {
      // Narrow blocks: explicit zero-filled partial loads, no mask needed.
      for( int y = 0; y <= maxY; y++)
      {
        __m128i v_level = maxX > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] )
                        : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 )
                        : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 );

        __m128i v_scale = maxX > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
                        : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
                        : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );
        v_level = _mm_max_epi16 (v_level, v_min);
        v_level = _mm_min_epi16 (v_level, v_max);
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);

        v_level = _mm_unpacklo_epi16(v_low,v_high);
        v_level = _mm_mullo_epi32(v_level,v_scale);
        v_level = _mm_sll_epi32(v_level,v_lshift);
        v_level = _mm_max_epi32 (v_level, v_Tmin);
        v_level = _mm_min_epi32 (v_level, v_Tmax);
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
      }
    }
#if USE_AVX2
    else if (maxX<8)
    {
      // One 8-coefficient row per iteration (this branch DOES mask the levels).
      __m256i xv_Tmax = _mm256_set1_epi32 ((short)transformMaximum);
      __m256i xv_Tmin = _mm256_set1_epi32 ((short)transformMinimum);

      for( int y = 0; y <= maxY; y++)
      {
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride] );
        __m256i v_scale = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width] );

        v_level = _mm_and_si128(v_level,vlevmask);
        v_level = _mm_max_epi16 (v_level, v_min);
        v_level = _mm_min_epi16 (v_level, v_max);
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);
        __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high));
        xv_level = _mm256_mullo_epi32(xv_level,v_scale);
        xv_level = _mm256_sll_epi32(xv_level,v_lshift);
        xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
        xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);
        _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level );
      }
    }

    else
    {
      // Wide blocks: 16 coefficients per inner iteration.
      __m256i xv_max = _mm256_set1_epi16 ((short)inputMaximum);
      __m256i xv_min = _mm256_set1_epi16 ((short)inputMinimum);
      __m256i xv_scaleQP = _mm256_set1_epi16 ((short)scaleQP);
      __m256i xv_Tmax = _mm256_set1_epi32 ((short)transformMaximum);
      __m256i xv_Tmin = _mm256_set1_epi32 ((short)transformMinimum);
      __m256i v_scale_l,v_scale_h;

      for( int y = 0; y <= maxY; y++)
      {
        for( int x = 0; x <= maxX; x+=16)
        {
          __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride] );
          v_scale_l = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x ]);
          v_scale_h = _mm256_loadu_si256( ( const __m256i* ) &piDequantCoef[y * width + x + 8 ]);

          xv_level = _mm256_and_si256(xv_level,xvlevmask);
          xv_level = _mm256_max_epi16 (xv_level, xv_min);
          xv_level = _mm256_min_epi16 (xv_level, xv_max);
          __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scaleQP);
          __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scaleQP);

          // Cross-lane fixup so unpacklo/unpackhi produce coefficient order
          // (see comment in the rightShift>0 path).
          xv_low = _mm256_permute4x64_epi64(xv_low,0xD8);
          xv_high = _mm256_permute4x64_epi64(xv_high,0xD8);

          // First 8 products: scale, left-shift, clamp, store.
          xv_level = _mm256_unpacklo_epi16(xv_low,xv_high);
          xv_level = _mm256_sll_epi32(xv_level,v_lshift);

          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);

          _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level );

          // Second 8 products (coefficients x+8 .. x+15).
          xv_level = _mm256_unpackhi_epi16(xv_low,xv_high);
          xv_level = _mm256_mullo_epi32(xv_level,v_scale_h);

          xv_level = _mm256_sll_epi32(xv_level,v_lshift);

          xv_level = _mm256_max_epi32 (xv_level, xv_Tmin);
          xv_level = _mm256_min_epi32 (xv_level, xv_Tmax);

          _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level );
        }
      }
    }

#else
    else
    {
      // SSE fallback for maxX>=4: 8 coefficients per inner iteration, upper 4
      // only when the row extends past x+4.
      __m128i v_scale_l,v_scale_h;
      for( int y = 0; y <= maxY; y++)
      {
        for( int x = 0; x <= maxX; x+=8)
        {
          __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride] );

          v_scale_l = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );
          v_level = _mm_and_si128(v_level,vlevmask);
          v_level = _mm_max_epi16 (v_level, v_min);
          v_level = _mm_min_epi16 (v_level, v_max);
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);

          // Lower 4 products: scale, left-shift, clamp, store.
          v_level = _mm_unpacklo_epi16(v_low,v_high);
          v_level = _mm_mullo_epi32(v_level,v_scale_l);
          v_level = _mm_sll_epi32(v_level,v_lshift);
          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );

          if (maxX + 1 - x > 4)
          {
            // Upper 4 products (coefficients x+4 .. x+7).
            v_scale_h = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4] );
            v_level = _mm_unpackhi_epi16(v_low,v_high);
            v_level = _mm_mullo_epi32(v_level,v_scale_h);
            v_level = _mm_sll_epi32(v_level,v_lshift);
            v_level = _mm_max_epi32 (v_level, v_Tmin);
            v_level = _mm_min_epi32 (v_level, v_Tmax);
            _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
          }
        }
      }
    }
#endif
  }
}
383 | | |
384 | | template< X86_VEXT vext> |
385 | | static void DeQuantCoreSIMD(const int maxX,const int restX,const int maxY,const int scale,const TCoeffSig*const piQCoef,const size_t piQCfStride,TCoeff *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum) |
386 | 0 | { |
387 | 0 | const int inputMinimum = -(inputMaximum+1); |
388 | 0 | const TCoeff transformMinimum = -(transformMaximum+1); |
389 | 0 | const int width = restX+maxX+1; |
390 | 0 | __m128i vlevmask; |
391 | 0 | if (maxX<7) |
392 | 0 | vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] ); |
393 | 0 | else |
394 | 0 | vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff); |
395 | |
|
396 | | #if USE_AVX2 |
397 | | __m256i xvlevmask; |
398 | 0 | if (maxX<15) |
399 | 0 | xvlevmask = _mm256_loadu_si256( ( __m256i const * )&xlevmask[15-maxX] ); |
400 | 0 | else |
401 | 0 | xvlevmask = _mm256_set_epi64x(0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff); |
402 | | |
403 | | #endif |
404 | |
|
405 | 0 | if (rightShift>0) |
406 | 0 | { |
407 | 0 | const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1); |
408 | |
|
409 | 0 | __m128i v_max = _mm_set1_epi16 ((short)inputMaximum); |
410 | 0 | __m128i v_min = _mm_set1_epi16 ((short)inputMinimum); |
411 | 0 | __m128i v_Tmax = _mm_set1_epi32 ((short)transformMaximum); |
412 | 0 | __m128i v_Tmin = _mm_set1_epi32 ((short)transformMinimum); |
413 | 0 | __m128i v_scale = _mm_set1_epi16 ((short)scale); |
414 | 0 | __m128i v_add = _mm_set1_epi32 (iAdd); |
415 | 0 | __m128i v_rshift = _mm_set1_epi64x (rightShift); |
416 | |
|
417 | 0 | if (maxX<4) |
418 | 0 | { |
419 | 0 | for( int y = 0; y <= maxY; y++) |
420 | 0 | { |
421 | 0 | __m128i v_level = maxX > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] ) |
422 | 0 | : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 ) |
423 | 0 | : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 ); |
424 | |
|
425 | 0 | v_level = _mm_max_epi16 (v_level, v_min); |
426 | 0 | v_level = _mm_min_epi16 (v_level, v_max); |
427 | 0 | __m128i v_low = _mm_mullo_epi16(v_level,v_scale); |
428 | 0 | __m128i v_high = _mm_mulhi_epi16(v_level,v_scale); |
429 | |
|
430 | 0 | v_level = _mm_unpacklo_epi16(v_low,v_high); |
431 | 0 | v_level = _mm_add_epi32(v_level,v_add); |
432 | 0 | v_level = _mm_sra_epi32(v_level,v_rshift); |
433 | |
|
434 | 0 | v_level = _mm_max_epi32 (v_level, v_Tmin); |
435 | 0 | v_level = _mm_min_epi32 (v_level, v_Tmax); |
436 | 0 | _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level ); |
437 | 0 | } |
438 | 0 | } |
439 | | #if USE_AVX2 |
440 | 0 | else if (maxX<8) |
441 | 0 | { |
442 | 0 | __m256i xv_add = _mm256_set1_epi32 (iAdd); |
443 | 0 | __m256i xv_Tmax = _mm256_set1_epi32 ((short)transformMaximum); |
444 | 0 | __m256i xv_Tmin = _mm256_set1_epi32 ((short)transformMinimum); |
445 | | |
446 | 0 | for( int y = 0; y <= maxY; y++) |
447 | 0 | { |
448 | 0 | __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride] ); |
449 | 0 | v_level = _mm_max_epi16 (v_level, v_min); |
450 | 0 | v_level = _mm_min_epi16 (v_level, v_max); |
451 | 0 | __m128i v_low = _mm_mullo_epi16(v_level,v_scale); |
452 | 0 | __m128i v_high = _mm_mulhi_epi16(v_level,v_scale); |
453 | |
|
454 | 0 | __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high)); |
455 | |
|
456 | 0 | xv_level = _mm256_add_epi32(xv_level,xv_add); |
457 | 0 | xv_level = _mm256_sra_epi32(xv_level,v_rshift); |
458 | |
|
459 | 0 | xv_level = _mm256_max_epi32 (xv_level, xv_Tmin); |
460 | 0 | xv_level = _mm256_min_epi32 (xv_level, xv_Tmax); |
461 | |
|
462 | 0 | _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level ); |
463 | 0 | } |
464 | 0 | } |
465 | 0 | else |
466 | 0 | { |
467 | 0 | __m256i xv_max = _mm256_set1_epi16 ((short)inputMaximum); |
468 | 0 | __m256i xv_min = _mm256_set1_epi16 ((short)inputMinimum); |
469 | 0 | __m256i xv_scale = _mm256_set1_epi16 ((short)scale); |
470 | 0 | __m256i xv_add = _mm256_set1_epi32 (iAdd); |
471 | 0 | __m256i xv_Tmax = _mm256_set1_epi32 ((short)transformMaximum); |
472 | 0 | __m256i xv_Tmin = _mm256_set1_epi32 ((short)transformMinimum); |
473 | |
|
474 | 0 | for( int y = 0; y <= maxY; y++) |
475 | 0 | { |
476 | 0 | for( int x = 0; x <= maxX; x+=16) |
477 | 0 | { |
478 | 0 | __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride] ); |
479 | 0 | xv_level = _mm256_and_si256(xv_level,xvlevmask); |
480 | 0 | xv_level = _mm256_max_epi16 (xv_level, xv_min); |
481 | 0 | xv_level = _mm256_min_epi16 (xv_level, xv_max); |
482 | 0 | __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scale); |
483 | 0 | __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scale); |
484 | |
|
485 | 0 | xv_low = _mm256_permute4x64_epi64(xv_low,0xD8); |
486 | 0 | xv_high = _mm256_permute4x64_epi64(xv_high,0xD8); |
487 | | |
488 | |
|
489 | 0 | xv_level = _mm256_unpacklo_epi16(xv_low,xv_high); |
490 | 0 | xv_level = _mm256_add_epi32(xv_level,xv_add); |
491 | 0 | xv_level = _mm256_sra_epi32(xv_level,v_rshift); |
492 | |
|
493 | 0 | xv_level = _mm256_max_epi32 (xv_level, xv_Tmin); |
494 | 0 | xv_level = _mm256_min_epi32 (xv_level, xv_Tmax); |
495 | |
|
496 | 0 | _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level ); |
497 | 0 | xv_level = _mm256_unpackhi_epi16(xv_low,xv_high); |
498 | 0 | xv_level = _mm256_add_epi32(xv_level,xv_add); |
499 | 0 | xv_level = _mm256_sra_epi32(xv_level,v_rshift); |
500 | |
|
501 | 0 | xv_level = _mm256_max_epi32 (xv_level, xv_Tmin); |
502 | 0 | xv_level = _mm256_min_epi32 (xv_level, xv_Tmax); |
503 | |
|
504 | 0 | _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level ); |
505 | 0 | } |
506 | 0 | } |
507 | |
|
508 | 0 | } |
509 | | |
510 | | #else |
511 | | /* |
512 | | else if (maxX<8) |
513 | | { |
514 | | for( int y = 0; y <= maxY; y++) |
515 | | { |
516 | | __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride] ); |
517 | | v_level = _mm_and_si128(v_level,vlevmask); |
518 | | v_level = _mm_max_epi16 (v_level, v_min); |
519 | | v_level = _mm_min_epi16 (v_level, v_max); |
520 | | __m128i v_low = _mm_mullo_epi16(v_level,v_scale); |
521 | | __m128i v_high = _mm_mulhi_epi16(v_level,v_scale); |
522 | | |
523 | | v_level = _mm_unpacklo_epi16(v_low,v_high); |
524 | | v_level = _mm_add_epi32(v_level,v_add); |
525 | | v_level = _mm_sra_epi32(v_level,v_rshift); |
526 | | |
527 | | v_level = _mm_max_epi32 (v_level, v_Tmin); |
528 | | v_level = _mm_min_epi32 (v_level, v_Tmax); |
529 | | _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level ); |
530 | | |
531 | | v_level = _mm_unpackhi_epi16(v_low,v_high); |
532 | | v_level = _mm_add_epi32(v_level,v_add); |
533 | | v_level = _mm_sra_epi32(v_level,v_rshift); |
534 | | |
535 | | v_level = _mm_max_epi32 (v_level, v_Tmin); |
536 | | v_level = _mm_min_epi32 (v_level, v_Tmax); |
537 | | _mm_storeu_si128(( __m128i * )(piCoef+4+y*width ), v_level ); |
538 | | } |
539 | | } |
540 | | */ |
541 | 0 | else |
542 | 0 | { |
543 | 0 | for( int y = 0; y <= maxY; y++) |
544 | 0 | { |
545 | 0 | for( int x = 0; x <= maxX; x+=8) |
546 | 0 | { |
547 | 0 | __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride] ); |
548 | 0 | v_level = _mm_and_si128(v_level,vlevmask); |
549 | 0 | v_level = _mm_max_epi16 (v_level, v_min); |
550 | 0 | v_level = _mm_min_epi16 (v_level, v_max); |
551 | 0 | __m128i v_low = _mm_mullo_epi16(v_level,v_scale); |
552 | 0 | __m128i v_high = _mm_mulhi_epi16(v_level,v_scale); |
553 | | |
554 | | v_level = _mm_unpacklo_epi16(v_low,v_high); |
555 | | v_level = _mm_add_epi32(v_level,v_add); |
556 | | v_level = _mm_sra_epi32(v_level,v_rshift); |
557 | | |
558 | | v_level = _mm_max_epi32 (v_level, v_Tmin); |
559 | | v_level = _mm_min_epi32 (v_level, v_Tmax); |
560 | | _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level ); |
561 | | |
562 | 0 | if( maxX + 1 - x <= 4 ) continue; |
563 | | |
564 | 0 | v_level = _mm_unpackhi_epi16(v_low,v_high); |
565 | 0 | v_level = _mm_add_epi32(v_level,v_add); |
566 | 0 | v_level = _mm_sra_epi32(v_level,v_rshift); |
567 | |
|
568 | 0 | v_level = _mm_max_epi32 (v_level, v_Tmin); |
569 | 0 | v_level = _mm_min_epi32 (v_level, v_Tmax); |
570 | 0 | _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level ); |
571 | 0 | } |
572 | 0 | } |
573 | 0 | } |
574 | | #endif |
575 | 0 | } |
576 | 0 | else // rightshift <0 |
577 | 0 | { |
578 | |
|
579 | 0 | __m128i v_max = _mm_set1_epi16 ((short)inputMaximum); |
580 | 0 | __m128i v_min = _mm_set1_epi16 ((short)inputMinimum); |
581 | 0 | __m128i v_Tmax = _mm_set1_epi32 ((short)transformMaximum); |
582 | 0 | __m128i v_Tmin = _mm_set1_epi32 ((short)transformMinimum); |
583 | 0 | __m128i v_scale = _mm_set1_epi16 ((short)scale); |
584 | 0 | __m128i v_lshift = _mm_set1_epi64x (-rightShift); |
585 | |
|
586 | 0 | if (maxX<4) |
587 | 0 | { |
588 | 0 | for( int y = 0; y <= maxY; y++) |
589 | 0 | { |
590 | 0 | __m128i v_level = maxX > 1 ? _mm_loadu_si64( ( const __m128i* ) &piQCoef[y * piQCfStride] ) |
591 | 0 | : maxX == 1 ? _mm_setr_epi16( piQCoef[y * piQCfStride], piQCoef[y * piQCfStride + 1], 0, 0, 0, 0, 0, 0 ) |
592 | 0 | : _mm_setr_epi16( piQCoef[y * piQCfStride], 0, 0, 0, 0, 0, 0, 0 ); |
593 | |
|
594 | 0 | v_level = _mm_max_epi16 (v_level, v_min); |
595 | 0 | v_level = _mm_min_epi16 (v_level, v_max); |
596 | 0 | __m128i v_low = _mm_mullo_epi16(v_level,v_scale); |
597 | 0 | __m128i v_high = _mm_mulhi_epi16(v_level,v_scale); |
598 | |
|
599 | 0 | v_level = _mm_unpacklo_epi16(v_low,v_high); |
600 | 0 | v_level = _mm_sll_epi32(v_level,v_lshift); |
601 | |
|
602 | 0 | v_level = _mm_max_epi32 (v_level, v_Tmin); |
603 | 0 | v_level = _mm_min_epi32 (v_level, v_Tmax); |
604 | 0 | _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level ); |
605 | |
|
606 | 0 | } |
607 | 0 | } |
608 | | #if USE_AVX2 |
609 | 0 | else if (maxX<8) |
610 | 0 | { |
611 | 0 | __m256i xv_Tmax = _mm256_set1_epi32 ((short)transformMaximum); |
612 | 0 | __m256i xv_Tmin = _mm256_set1_epi32 ((short)transformMinimum); |
613 | | |
614 | 0 | for( int y = 0; y <= maxY; y++) |
615 | 0 | { |
616 | 0 | __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride] ); |
617 | 0 | v_level = _mm_and_si128(v_level,vlevmask); |
618 | 0 | v_level = _mm_max_epi16 (v_level, v_min); |
619 | 0 | v_level = _mm_min_epi16 (v_level, v_max); |
620 | 0 | __m128i v_low = _mm_mullo_epi16(v_level,v_scale); |
621 | 0 | __m128i v_high = _mm_mulhi_epi16(v_level,v_scale); |
622 | |
|
623 | 0 | __m256i xv_level = _mm256_set_m128i (_mm_unpackhi_epi16(v_low,v_high), _mm_unpacklo_epi16(v_low,v_high)); |
624 | |
|
625 | 0 | xv_level = _mm256_sll_epi32(xv_level,v_lshift); |
626 | |
|
627 | 0 | xv_level = _mm256_max_epi32 (xv_level, xv_Tmin); |
628 | 0 | xv_level = _mm256_min_epi32 (xv_level, xv_Tmax); |
629 | |
|
630 | 0 | _mm256_storeu_si256(( __m256i * )(piCoef+y*width ), xv_level ); |
631 | 0 | } |
632 | 0 | } |
633 | | |
634 | 0 | else |
635 | 0 | { |
636 | 0 | __m256i xv_max = _mm256_set1_epi16 ((short)inputMaximum); |
637 | 0 | __m256i xv_min = _mm256_set1_epi16 ((short)inputMinimum); |
638 | 0 | __m256i xv_scale = _mm256_set1_epi16 ((short)scale); |
639 | 0 | __m256i xv_Tmax = _mm256_set1_epi32 ((short)transformMaximum); |
640 | 0 | __m256i xv_Tmin = _mm256_set1_epi32 ((short)transformMinimum); |
641 | |
|
642 | 0 | for( int y = 0; y <= maxY; y++) |
643 | 0 | { |
644 | 0 | for( int x = 0; x <= maxX; x+=16) |
645 | 0 | { |
646 | 0 | __m256i xv_level = _mm256_loadu_si256( ( __m256i const * )&piQCoef[x+ y * piQCfStride] ); |
647 | 0 | xv_level = _mm256_and_si256(xv_level,xvlevmask); |
648 | 0 | xv_level = _mm256_max_epi16 (xv_level, xv_min); |
649 | 0 | xv_level = _mm256_min_epi16 (xv_level, xv_max); |
650 | 0 | __m256i xv_low = _mm256_mullo_epi16(xv_level,xv_scale); |
651 | 0 | __m256i xv_high = _mm256_mulhi_epi16(xv_level,xv_scale); |
652 | |
|
653 | 0 | xv_low = _mm256_permute4x64_epi64(xv_low,0xD8); |
654 | 0 | xv_high = _mm256_permute4x64_epi64(xv_high,0xD8); |
655 | | |
656 | |
|
657 | 0 | xv_level = _mm256_unpacklo_epi16(xv_low,xv_high); |
658 | 0 | xv_level = _mm256_sll_epi32(xv_level,v_lshift); |
659 | |
|
660 | 0 | xv_level = _mm256_max_epi32 (xv_level, xv_Tmin); |
661 | 0 | xv_level = _mm256_min_epi32 (xv_level, xv_Tmax); |
662 | |
|
663 | 0 | _mm256_storeu_si256(( __m256i * )(piCoef+x+y*width ), xv_level ); |
664 | 0 | xv_level = _mm256_unpackhi_epi16(xv_low,xv_high); |
665 | 0 | xv_level = _mm256_sll_epi32(xv_level,v_lshift); |
666 | |
|
667 | 0 | xv_level = _mm256_max_epi32 (xv_level, xv_Tmin); |
668 | 0 | xv_level = _mm256_min_epi32 (xv_level, xv_Tmax); |
669 | |
|
670 | 0 | _mm256_storeu_si256(( __m256i * )(piCoef+8+x+y*width ), xv_level ); |
671 | 0 | } |
672 | 0 | } |
673 | 0 | } |
674 | | |
675 | | #else |
676 | | /* |
677 | | else if (maxX<8) |
678 | | { |
679 | | for( int y = 0; y <= maxY; y++) |
680 | | { |
681 | | __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride] ); |
682 | | v_level = _mm_and_si128(v_level,vlevmask); |
683 | | v_level = _mm_max_epi16 (v_level, v_min); |
684 | | v_level = _mm_min_epi16 (v_level, v_max); |
685 | | __m128i v_low = _mm_mullo_epi16(v_level,v_scale); |
686 | | __m128i v_high = _mm_mulhi_epi16(v_level,v_scale); |
687 | | |
688 | | v_level = _mm_unpacklo_epi16(v_low,v_high); |
689 | | v_level = _mm_sll_epi32(v_level,v_lshift); |
690 | | |
691 | | v_level = _mm_max_epi32 (v_level, v_Tmin); |
692 | | v_level = _mm_min_epi32 (v_level, v_Tmax); |
693 | | _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level ); |
694 | | |
695 | | v_level = _mm_unpackhi_epi16(v_low,v_high); |
696 | | v_level = _mm_sll_epi32(v_level,v_lshift); |
697 | | |
698 | | v_level = _mm_max_epi32 (v_level, v_Tmin); |
699 | | v_level = _mm_min_epi32 (v_level, v_Tmax); |
700 | | _mm_storeu_si128(( __m128i * )(piCoef+4+y*width ), v_level ); |
701 | | } |
702 | | } |
703 | | */ |
704 | 0 | else |
705 | 0 | { |
706 | 0 | for( int y = 0; y <= maxY; y++) |
707 | 0 | { |
708 | 0 | for( int x = 0; x <= maxX; x+=8) |
709 | 0 | { |
710 | 0 | __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride] ); |
711 | 0 | v_level = _mm_and_si128(v_level,vlevmask); |
712 | 0 | v_level = _mm_max_epi16 (v_level, v_min); |
713 | 0 | v_level = _mm_min_epi16 (v_level, v_max); |
714 | 0 | __m128i v_low = _mm_mullo_epi16(v_level,v_scale); |
715 | 0 | __m128i v_high = _mm_mulhi_epi16(v_level,v_scale); |
716 | | |
717 | | v_level = _mm_unpacklo_epi16(v_low,v_high); |
718 | | v_level = _mm_sll_epi32(v_level,v_lshift); |
719 | | |
720 | | v_level = _mm_max_epi32 (v_level, v_Tmin); |
721 | | v_level = _mm_min_epi32 (v_level, v_Tmax); |
722 | | _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level ); |
723 | | |
724 | 0 | if( maxX + 1 - x <= 4 ) continue; |
725 | | |
726 | 0 | v_level = _mm_unpackhi_epi16(v_low,v_high); |
727 | 0 | v_level = _mm_sll_epi32(v_level,v_lshift); |
728 | |
|
729 | 0 | v_level = _mm_max_epi32 (v_level, v_Tmin); |
730 | 0 | v_level = _mm_min_epi32 (v_level, v_Tmax); |
731 | 0 | _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level ); |
732 | 0 | } |
733 | 0 | } |
734 | 0 | } |
735 | | #endif |
736 | 0 | } |
737 | 0 | } Unexecuted instantiation: Quant_sse41.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, int, int, short const*, unsigned long, int*, int, int, int) Unexecuted instantiation: Quant_avx2.cpp:void vvdec::DeQuantCoreSIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, int, int, short const*, unsigned long, int*, int, int, int) |
738 | | |
// De-quantizes a rectangular block of PCM / transform-skip coefficient levels with a
// single flat scale factor (no per-position scaling list).
//
// Input levels in piQCoef are 32-bit TCoeff with row stride piQCfStride; results are
// written densely to piCoef with row stride width = restX + maxX + 1.  Per sample:
//   clip( ( clip(level, inputMin..inputMax) * scale + iAdd ) >> rightShift,
//         transformMin..transformMax )
// where the shift becomes a left shift when rightShift <= 0 (no rounding term then).
//
// \param maxX              last coefficient column to process (inclusive)
// \param restX             remaining columns to the right; width = restX + maxX + 1
// \param maxY              last coefficient row to process (inclusive)
// \param scale             flat de-quantization scale (must fit in 16 bit)
// \param piQCoef           input quantized levels (32-bit, stride piQCfStride)
// \param piQCfStride       row stride of piQCoef in elements
// \param piCoef            output de-quantized coefficients (dense, stride width)
// \param rightShift        final normalization shift; <= 0 means shift left by -rightShift
// \param inputMaximum      clipping maximum for the input levels
// \param transformMaximum  clipping maximum for the output coefficients
template< X86_VEXT vext>
static void DeQuantCorePCMSIMD(const int maxX,const int restX,const int maxY,const int scale,TCoeff *const piQCoef,const size_t piQCfStride,TCoeff *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
{
  const int inputMinimum = -(inputMaximum+1);
  const TCoeff transformMinimum = -(transformMaximum+1);
  const int width = restX+maxX+1;
  // Mask that zeroes the 16-bit lanes beyond maxX so a partially filled vector
  // does not process garbage lanes (levmask is the file-level sliding mask table).
  __m128i vlevmask;
  if (maxX<7)
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
  else
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);

  if (rightShift>0)
  {
    // Rounding offset for the final arithmetic right shift.
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);

    __m128i v_max = _mm_set1_epi16 ((short)inputMaximum);
    __m128i v_min = _mm_set1_epi16 ((short)inputMinimum);
    // NOTE(review): transformMaximum/Minimum are narrowed to short before the 32-bit
    // splat -- fine while the transform dynamic range is <= 15 bit; verify if
    // extended-precision ranges can reach this kernel.
    __m128i v_Tmax = _mm_set1_epi32 ((short)transformMaximum);
    __m128i v_Tmin = _mm_set1_epi32 ((short)transformMinimum);
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
    __m128i v_add = _mm_set1_epi32 (iAdd);
    __m128i v_rshift = _mm_set1_epi64x (rightShift);

    if (maxX<4)
    {
      // Narrow block: one vector of four 32-bit levels per row.
      for( int y = 0; y <= maxY; y++)
      {
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride] );
        // Saturate the 32-bit levels to 16 bit (result duplicated into both halves).
        v_level = _mm_packs_epi32 (v_level,v_level);
        v_level = _mm_and_si128(v_level,vlevmask);
        v_level = _mm_max_epi16 (v_level, v_min);
        v_level = _mm_min_epi16 (v_level, v_max);
        // 16x16 -> 32 bit multiply: mullo/mulhi hold the low/high product halves.
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);

        // Interleave to full 32-bit products, then round and shift down.
        v_level = _mm_unpacklo_epi16(v_low,v_high);
        v_level = _mm_add_epi32(v_level,v_add);
        v_level = _mm_sra_epi32(v_level,v_rshift);

        // Clip to the transform dynamic range and store four results.
        v_level = _mm_max_epi32 (v_level, v_Tmin);
        v_level = _mm_min_epi32 (v_level, v_Tmax);
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
      }
    }
    else
    {
      // Wide block: eight coefficients per inner iteration (two 4x32-bit loads
      // packed into one vector of eight 16-bit levels).
      for( int y = 0; y <= maxY; y++)
      {
        for( int x = 0; x <= maxX; x+=8)
        {
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride] );
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride] );
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
          v_level = _mm_and_si128(v_level,vlevmask);
          v_level = _mm_max_epi16 (v_level, v_min);
          v_level = _mm_min_epi16 (v_level, v_max);
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);

          // Lower four 32-bit products: round, shift, clip, store.
          v_level = _mm_unpacklo_epi16(v_low,v_high);
          v_level = _mm_add_epi32(v_level,v_add);
          v_level = _mm_sra_epi32(v_level,v_rshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );

          // Upper four 32-bit products.
          v_level = _mm_unpackhi_epi16(v_low,v_high);
          v_level = _mm_add_epi32(v_level,v_add);
          v_level = _mm_sra_epi32(v_level,v_rshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
        }
      }
    }
  }
  else // rightShift <= 0: scale up with a left shift by -rightShift instead
  {

    __m128i v_max = _mm_set1_epi16 ((short)inputMaximum);
    __m128i v_min = _mm_set1_epi16 ((short)inputMinimum);
    // NOTE(review): same short-narrowing of the 32-bit clip bounds as above.
    __m128i v_Tmax = _mm_set1_epi32 ((short)transformMaximum);
    __m128i v_Tmin = _mm_set1_epi32 ((short)transformMinimum);
    __m128i v_scale = _mm_set1_epi16 ((short)scale);
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);

    if (maxX<4)
    {
      // Narrow block: four coefficients per row.
      for( int y = 0; y <= maxY; y++)
      {
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride] );
        v_level = _mm_packs_epi32 (v_level,v_level);
        v_level = _mm_and_si128(v_level,vlevmask);

        v_level = _mm_max_epi16 (v_level, v_min);
        v_level = _mm_min_epi16 (v_level, v_max);
        __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);

        // Full 32-bit products, scaled up by the left shift.
        v_level = _mm_unpacklo_epi16(v_low,v_high);
        v_level = _mm_sll_epi32(v_level,v_lshift);

        v_level = _mm_max_epi32 (v_level, v_Tmin);
        v_level = _mm_min_epi32 (v_level, v_Tmax);
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
      }
    }
    else
    {
      // Wide block: eight coefficients per inner iteration.
      for( int y = 0; y <= maxY; y++)
      {
        for( int x = 0; x <= maxX; x+=8)
        {
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride] );
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride] );
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
          v_level = _mm_and_si128(v_level,vlevmask);
          v_level = _mm_max_epi16 (v_level, v_min);
          v_level = _mm_min_epi16 (v_level, v_max);
          __m128i v_low = _mm_mullo_epi16(v_level,v_scale);
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scale);

          // Lower four products: shift up, clip, store.
          v_level = _mm_unpacklo_epi16(v_low,v_high);
          v_level = _mm_sll_epi32(v_level,v_lshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );

          // Upper four products.
          v_level = _mm_unpackhi_epi16(v_low,v_high);
          v_level = _mm_sll_epi32(v_level,v_lshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
        }
      }
    }
  }
}
883 | | |
// De-quantizes a rectangular block of PCM / transform-skip coefficient levels applying
// a per-position scaling list (piDequantCoef) on top of the flat QP-derived scale.
//
// Input levels in piQCoef are 32-bit TCoeff with row stride piQCfStride; the scaling
// list and the output piCoef are both dense with row stride width = restX + maxX + 1.
// Per sample:
//   clip( ( clip(level, inputMin..inputMax) * scaleQP * dequantCoef[pos] + iAdd )
//           >> rightShift, transformMin..transformMax )
// where the shift becomes a left shift when rightShift <= 0 (no rounding term then).
//
// \param maxX              last coefficient column to process (inclusive)
// \param restX             remaining columns to the right; width = restX + maxX + 1
// \param maxY              last coefficient row to process (inclusive)
// \param scaleQP           QP-derived de-quantization scale (must fit in 16 bit)
// \param piDequantCoef     per-position scaling-list factors (dense, stride width)
// \param piQCoef           input quantized levels (32-bit, stride piQCfStride)
// \param piQCfStride       row stride of piQCoef in elements
// \param piCoef            output de-quantized coefficients (dense, stride width)
// \param rightShift        final normalization shift; <= 0 means shift left by -rightShift
// \param inputMaximum      clipping maximum for the input levels
// \param transformMaximum  clipping maximum for the output coefficients
template< X86_VEXT vext>
static void DeQuantScalingPCMCoreSIMD(const int maxX,const int restX,const int maxY,const int scaleQP,const int *piDequantCoef,TCoeff *const piQCoef,const size_t piQCfStride,TCoeff *const piCoef,const int rightShift,const int inputMaximum,const TCoeff transformMaximum)
{
  const int inputMinimum = -(inputMaximum+1);
  const TCoeff transformMinimum = -(transformMaximum+1);
  const int width = restX+maxX+1;
  // Mask that zeroes the 16-bit lanes beyond maxX so a partially filled vector
  // does not process garbage lanes (levmask is the file-level sliding mask table).
  __m128i vlevmask;
  if (maxX<7)
    vlevmask = _mm_loadu_si128( ( __m128i const * )&levmask[7-maxX] );
  else
    vlevmask = _mm_set_epi64x(0xffffffffffffffff,0xffffffffffffffff);

  if (rightShift>0)
  {
    // Rounding offset for the final arithmetic right shift.
    const Intermediate_Int iAdd = (Intermediate_Int) 1 << (rightShift - 1);

    __m128i v_max = _mm_set1_epi16 ((short)inputMaximum);
    __m128i v_min = _mm_set1_epi16 ((short)inputMinimum);
    // NOTE(review): transformMaximum/Minimum are narrowed to short before the 32-bit
    // splat -- fine while the transform dynamic range is <= 15 bit; verify if
    // extended-precision ranges can reach this kernel.
    __m128i v_Tmax = _mm_set1_epi32 ((short)transformMaximum);
    __m128i v_Tmin = _mm_set1_epi32 ((short)transformMinimum);
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
    __m128i v_add = _mm_set1_epi32 (iAdd);
    __m128i v_rshift = _mm_set1_epi64x (rightShift);

    if (maxX<4)
    {
      // Narrow block: four coefficients per row.  The scaling-list row is shorter
      // than a vector, so load only the valid entries to avoid reading past it.
      for( int y = 0; y <= maxY; y++)
      {
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride] );
        __m128i v_scale = maxX > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
                        : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
                                    : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );

        // Saturate the 32-bit levels to 16 bit, mask and clip to the input range.
        v_level = _mm_packs_epi32 (v_level,v_level);
        v_level = _mm_and_si128(v_level,vlevmask);
        v_level = _mm_max_epi16 (v_level, v_min);
        v_level = _mm_min_epi16 (v_level, v_max);
        // 16x16 -> 32 bit multiply by scaleQP (low/high product halves).
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);

        // Apply the per-position scaling-list factor.  NOTE(review): mullo_epi32
        // keeps only the low 32 product bits -- relies on level*scaleQP*coef
        // staying within 32 bit; confirm against the callers' range guarantees.
        v_level = _mm_unpacklo_epi16(v_low,v_high);
        v_level = _mm_mullo_epi32(v_level,v_scale);

        v_level = _mm_add_epi32(v_level,v_add);
        v_level = _mm_sra_epi32(v_level,v_rshift);

        // Clip to the transform dynamic range and store four results.
        v_level = _mm_max_epi32 (v_level, v_Tmin);
        v_level = _mm_min_epi32 (v_level, v_Tmax);
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
      }
    }
    else
    {
      // Wide block: eight coefficients per inner iteration.
      for( int y = 0; y <= maxY; y++)
      {
        for( int x = 0; x <= maxX; x+=8)
        {
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride] );
          __m128i v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride] );
          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);

          v_level = _mm_and_si128(v_level,vlevmask);
          v_level = _mm_max_epi16 (v_level, v_min);
          v_level = _mm_min_epi16 (v_level, v_max);
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);

          // Lower four 32-bit products times their scaling-list factors.
          v_level = _mm_unpacklo_epi16(v_low,v_high);

          v_level = _mm_mullo_epi32(v_level,v_scale);

          v_level = _mm_add_epi32(v_level,v_add);
          v_level = _mm_sra_epi32(v_level,v_rshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );

          // Upper four products with the next four scaling-list factors.
          v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4]);
          v_level = _mm_unpackhi_epi16(v_low,v_high);
          v_level = _mm_mullo_epi32(v_level,v_scale);
          v_level = _mm_add_epi32(v_level,v_add);
          v_level = _mm_sra_epi32(v_level,v_rshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
        }
      }
    }
  }
  else // rightShift <= 0: scale up with a left shift by -rightShift instead
  {
    __m128i v_max = _mm_set1_epi16 ((short)inputMaximum);
    __m128i v_min = _mm_set1_epi16 ((short)inputMinimum);
    // NOTE(review): same short-narrowing of the 32-bit clip bounds as above.
    __m128i v_Tmax = _mm_set1_epi32 ((short)transformMaximum);
    __m128i v_Tmin = _mm_set1_epi32 ((short)transformMinimum);
    __m128i v_scaleQP = _mm_set1_epi16 ((short)scaleQP);
    __m128i v_lshift = _mm_set1_epi64x (-rightShift);

    if (maxX<4)
    {
      // Narrow block: four coefficients per row.
      for( int y = 0; y <= maxY; y++)
      {
        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride] );
        __m128i v_scale = maxX > 1 ? _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width] )
                        : maxX == 1 ? _mm_setr_epi32( piDequantCoef[y * width], piDequantCoef[y * width + 1], 0, 0 )
                                    : _mm_setr_epi32( piDequantCoef[y * width], 0, 0, 0 );

        v_level = _mm_packs_epi32 (v_level,v_level);
        v_level = _mm_and_si128(v_level,vlevmask);

        v_level = _mm_max_epi16 (v_level, v_min);
        v_level = _mm_min_epi16 (v_level, v_max);
        __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
        __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);

        // Full 32-bit products times scaling-list factors, scaled up by the shift.
        v_level = _mm_unpacklo_epi16(v_low,v_high);
        v_level = _mm_mullo_epi32(v_level,v_scale);

        v_level = _mm_sll_epi32(v_level,v_lshift);
        v_level = _mm_max_epi32 (v_level, v_Tmin);
        v_level = _mm_min_epi32 (v_level, v_Tmax);
        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
      }
    }
    else
    {
      // Wide block: eight coefficients per inner iteration.
      for( int y = 0; y <= maxY; y++)
      {
        for( int x = 0; x <= maxX; x+=8)
        {
          __m128i v_levell = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+ y * piQCfStride] );
          __m128i v_levelh = _mm_loadu_si128( ( __m128i const * )&piQCoef[x+4 + y * piQCfStride] );
          __m128i v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x] );

          __m128i v_level = _mm_packs_epi32 (v_levell,v_levelh);
          v_level = _mm_and_si128(v_level,vlevmask);
          v_level = _mm_max_epi16 (v_level, v_min);
          v_level = _mm_min_epi16 (v_level, v_max);
          __m128i v_low = _mm_mullo_epi16(v_level,v_scaleQP);
          __m128i v_high = _mm_mulhi_epi16(v_level,v_scaleQP);

          // Lower four products: apply scaling list, shift up, clip, store.
          v_level = _mm_unpacklo_epi16(v_low,v_high);
          v_level = _mm_mullo_epi32(v_level,v_scale);

          v_level = _mm_sll_epi32(v_level,v_lshift);

          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+x+y*width ), v_level );

          // Upper four products with the next four scaling-list factors.
          v_scale = _mm_loadu_si128( ( const __m128i* ) &piDequantCoef[y * width+x+4]);

          v_level = _mm_unpackhi_epi16(v_low,v_high);
          v_level = _mm_mullo_epi32(v_level,v_scale);

          v_level = _mm_sll_epi32(v_level,v_lshift);
          v_level = _mm_max_epi32 (v_level, v_Tmin);
          v_level = _mm_min_epi32 (v_level, v_Tmax);
          _mm_storeu_si128(( __m128i * )(piCoef+4+x+y*width ), v_level );
        }
      }
    }
  }
}
1051 | | |
1052 | | |
1053 | | |
1054 | | |
1055 | | template<X86_VEXT vext> |
1056 | | void Quant::_initQuantX86() |
1057 | 0 | { |
1058 | 0 | DeQuant = DeQuantCoreSIMD<vext>; |
1059 | 0 | DeQuantPCM = DeQuantCorePCMSIMD<vext>; |
1060 | 0 | DeQuantScaling = DeQuantScalingCoreSIMD<vext>; |
1061 | 0 | DeQuantScalingPCM = DeQuantScalingPCMCoreSIMD<vext>; |
1062 | |
|
1063 | 0 | } Unexecuted instantiation: void vvdec::Quant::_initQuantX86<(vvdec::x86_simd::X86_VEXT)1>() Unexecuted instantiation: void vvdec::Quant::_initQuantX86<(vvdec::x86_simd::X86_VEXT)4>() |
1064 | | template void Quant::_initQuantX86<SIMDX86>(); |
1065 | | |
1066 | | #endif // TARGET_SIMD_X86 |
1067 | | #endif |
1068 | | |
1069 | | } |