/src/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <assert.h> |
12 | | #include <immintrin.h> // AVX2 |
13 | | |
14 | | #include "./vp9_rtcd.h" |
15 | | #include "vpx/vpx_integer.h" |
16 | | #include "vpx_dsp/vpx_dsp_common.h" |
17 | | #include "vpx_dsp/x86/bitdepth_conversion_avx2.h" |
18 | | #include "vpx_dsp/x86/quantize_sse2.h" |
19 | | #include "vp9/common/vp9_scan.h" |
20 | | #include "vp9/encoder/vp9_block.h" |
21 | | |
22 | | // Zero fill 16 positions in the output buffer. |
// One call clears a full 16-coefficient step of the callers: two 256-bit
// stores when CONFIG_VP9_HIGHBITDEPTH is set, one 256-bit store otherwise.
// NOTE(review): this assumes tran_low_t is 32 bits wide in high bitdepth
// builds and 16 bits wide otherwise -- confirm against its typedef.
// The stores are unaligned, so `a` needs no particular alignment.
23 | 0 | static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { |
24 | 0 | const __m256i zero = _mm256_setzero_si256(); |
25 | 0 | #if CONFIG_VP9_HIGHBITDEPTH |
26 | 0 | _mm256_storeu_si256((__m256i *)(a), zero); |
27 | 0 | _mm256_storeu_si256((__m256i *)(a + 8), zero); |
28 | | #else |
29 | | _mm256_storeu_si256((__m256i *)(a), zero); |
30 | | #endif |
31 | 0 | } |
32 | | |
// Loads the round, quant and dequant tables (eight int16_t values each) and
// widens each one from 128 to 256 bits.
//   mb_plane    - supplies the round_fp and quant_fp tables.
//   dequant_ptr - dequantization table.
//   round, quant, dequant - outputs.
// _mm_load_si128 is an aligned load, so all three tables must be 16-byte
// aligned. The permute control 0x54 selects 64-bit elements {0, 1, 1, 1}:
// the low 128 bits keep the eight loaded values unchanged and the high
// 128 bits hold values 4..7 twice.
// NOTE(review): presumably value 0 is the DC entry and the others are AC
// entries, which is why callers later keep only the high half ("remove dc
// constants") -- confirm against the table layout.
33 | | static VPX_FORCE_INLINE void load_fp_values_avx2( |
34 | | const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, |
35 | 0 | const int16_t *dequant_ptr, __m256i *dequant) { |
36 | 0 | *round = _mm256_castsi128_si256( |
37 | 0 | _mm_load_si128((const __m128i *)mb_plane->round_fp)); |
38 | 0 | *round = _mm256_permute4x64_epi64(*round, 0x54); |
39 | 0 | *quant = _mm256_castsi128_si256( |
40 | 0 | _mm_load_si128((const __m128i *)mb_plane->quant_fp)); |
41 | 0 | *quant = _mm256_permute4x64_epi64(*quant, 0x54); |
42 | 0 | *dequant = |
43 | 0 | _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); |
44 | 0 | *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); |
45 | 0 | } |
46 | | |
// Folds 16 inverse-scan values into the running per-lane maximum.
//   iscan    - 16 int16_t values, read with an unaligned load.
//   v_eobmax - running maximum, 16 x int16.
//   v_mask   - per-lane mask; lanes whose mask is zero contribute 0.
// Returns the lane-wise signed 16-bit maximum of v_eobmax and
// (iscan & v_mask).
47 | | static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, |
48 | | __m256i v_eobmax, |
49 | 0 | __m256i v_mask) { |
50 | 0 | #if CONFIG_VP9_HIGHBITDEPTH |
// 0xD8 selects 64-bit elements {0, 2, 1, 3}, i.e. swaps the middle two.
// NOTE(review): presumably this mirrors the lane order load_tran_low()
// produces in high bitdepth builds -- confirm in bitdepth_conversion_avx2.h.
51 | 0 | const __m256i v_iscan = _mm256_permute4x64_epi64( |
52 | 0 | _mm256_loadu_si256((const __m256i *)iscan), 0xD8); |
53 | | #else |
54 | | const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); |
55 | | #endif |
56 | 0 | const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); |
57 | 0 | return _mm256_max_epi16(v_eobmax, v_nz_iscan); |
58 | 0 | } |
59 | | |
// Horizontal reduction: returns the largest signed 16-bit lane of eob256.
// Each step folds the upper half of the remaining candidates onto the lower
// half with _mm256_max_epi16 until the result sits in lane 0.
60 | 0 | static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) { |
61 | 0 | const __m256i eob_lo = eob256; |
62 | | // Copy upper 128 to lower 128 |
// (control 0x81 also zeroes the upper 128 bits of eob_hi).
63 | 0 | const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81); |
64 | 0 | __m256i eob = _mm256_max_epi16(eob_lo, eob_hi); |
// 0xe moves dwords 2,3 of each 128-bit lane down to dwords 0,1 (8 -> 4).
65 | 0 | __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe); |
66 | 0 | eob = _mm256_max_epi16(eob, eob_s); |
// 0xe moves words 2,3 down to words 0,1 (4 -> 2).
67 | 0 | eob_s = _mm256_shufflelo_epi16(eob, 0xe); |
68 | 0 | eob = _mm256_max_epi16(eob, eob_s); |
// 1 moves word 1 down to word 0 (2 -> 1).
69 | 0 | eob_s = _mm256_shufflelo_epi16(eob, 1); |
70 | 0 | eob = _mm256_max_epi16(eob, eob_s); |
// NOTE(review): the fallback presumably works around _mm256_extract_epi16
// being unavailable in MSVC before 19.10 -- confirm.
71 | | #if defined(_MSC_VER) && (_MSC_VER < 1910) |
72 | | return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; |
73 | | #else |
74 | 0 | return (uint16_t)_mm256_extract_epi16(eob, 0); |
75 | 0 | #endif |
76 | 0 | } |
77 | | |
// Quantizes one group of 16 coefficients using 16-bit lanes.
//   round, quant, dequant - per-lane rounding, multiplier and
//                           dequantization values.
//   thr         - per-lane skip threshold for the group.
//   coeff_ptr   - 16 input coefficients, read through load_tran_low().
//   iscan_ptr   - the 16 matching inverse-scan values.
//   qcoeff_ptr, dqcoeff_ptr - 16 outputs each.
//   eob_max     - running per-lane maximum, updated in place.
// If no |coeff| exceeds its thr lane, both outputs are zero-filled and
// eob_max is left untouched. Otherwise all 16 lanes are quantized (lanes
// below the threshold are not masked individually):
//   abs_q = high 16 bits of (saturating(|coeff| + round) * quant)
//   q     = abs_q carrying the sign of coeff (0 where coeff is 0)
//   dq    = low 16 bits of q * dequant
78 | | static VPX_FORCE_INLINE void quantize_fp_16( |
79 | | const __m256i *round, const __m256i *quant, const __m256i *dequant, |
80 | | const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, |
81 | 0 | tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { |
82 | 0 | const __m256i coeff = load_tran_low(coeff_ptr); |
83 | 0 | const __m256i abs_coeff = _mm256_abs_epi16(coeff); |
// Nonzero when at least one lane is above the threshold.
84 | 0 | const int32_t nzflag = |
85 | 0 | _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr)); |
86 | 

87 | 0 | if (nzflag) { |
88 | 0 | const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); |
89 | 0 | const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); |
90 | 0 | const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); |
91 | 0 | const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant); |
// All-ones in lanes whose quantized magnitude is nonzero.
92 | 0 | const __m256i nz_mask = |
93 | 0 | _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); |
94 | 0 | store_tran_low(qcoeff, qcoeff_ptr); |
95 | 0 | store_tran_low(dqcoeff, dqcoeff_ptr); |
96 | 

97 | 0 | *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); |
98 | 0 | } else { |
99 | 0 | store_zero_tran_low(qcoeff_ptr); |
100 | 0 | store_zero_tran_low(dqcoeff_ptr); |
101 | 0 | } |
102 | 0 | } |
103 | | |
// AVX2 quantizer entry point that processes 16 coefficients per step.
//   coeff_ptr   - input coefficients.
//   n_coeffs    - number of coefficients.
//   mb_plane    - supplies the round_fp and quant_fp tables.
//   qcoeff_ptr  - output: quantized coefficients.
//   dqcoeff_ptr - output: dequantized coefficients.
//   dequant_ptr - dequantization table.
//   eob_ptr     - output: the largest iscan value found at a position whose
//                 quantized coefficient is nonzero, or 0 if there is none.
//   scan_order  - only its iscan table is read.
// NOTE(review): the first group is processed unconditionally and the loop
// advances by 16, so n_coeffs is assumed to be a positive multiple of 16;
// nothing in this function checks that.
104 | | void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, |
105 | | const struct macroblock_plane *const mb_plane, |
106 | | tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, |
107 | | const int16_t *dequant_ptr, uint16_t *eob_ptr, |
108 | 0 | const struct ScanOrder *const scan_order) { |
109 | 0 | __m256i round, quant, dequant, thr; |
110 | 0 | __m256i eob_max = _mm256_setzero_si256(); |
111 | 0 | const int16_t *iscan = scan_order->iscan; |
112 | 

// Point every array at its end and let a negative index count up to zero.
113 | 0 | coeff_ptr += n_coeffs; |
114 | 0 | iscan += n_coeffs; |
115 | 0 | qcoeff_ptr += n_coeffs; |
116 | 0 | dqcoeff_ptr += n_coeffs; |
117 | 0 | n_coeffs = -n_coeffs; |
118 | | |
119 | | // Setup global values |
120 | 0 | load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); |
// With a zero threshold the first group is skipped only when all 16 of its
// coefficients are zero.
121 | 0 | thr = _mm256_setzero_si256(); |
122 | 

123 | 0 | quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, |
124 | 0 | iscan + n_coeffs, qcoeff_ptr + n_coeffs, |
125 | 0 | dqcoeff_ptr + n_coeffs, &eob_max); |
126 | 

127 | 0 | n_coeffs += 8 * 2; |
128 | | |
129 | | // remove dc constants |
// Control 0x31 copies the upper 128 bits into both halves, discarding the
// values held in the low half.
130 | 0 | dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); |
131 | 0 | quant = _mm256_permute2x128_si256(quant, quant, 0x31); |
132 | 0 | round = _mm256_permute2x128_si256(round, round, 0x31); |
// Remaining groups are skipped when no |coeff| exceeds dequant >> 1.
133 | 0 | thr = _mm256_srai_epi16(dequant, 1); |
134 | | |
135 | | // AC only loop |
136 | 0 | while (n_coeffs < 0) { |
137 | 0 | quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, |
138 | 0 | iscan + n_coeffs, qcoeff_ptr + n_coeffs, |
139 | 0 | dqcoeff_ptr + n_coeffs, &eob_max); |
140 | 0 | n_coeffs += 8 * 2; |
141 | 0 | } |
142 | 

143 | 0 | *eob_ptr = get_max_eob(eob_max); |
144 | 0 | } |
145 | | |
146 | | // Enable this flag when matching the optimized code to |
147 | | // vp9_quantize_fp_32x32_c(). When disabled, the optimized code will match the |
148 | | // existing ssse3 code and quantize_fp_32x32_nz_c(). |
149 | | // |
150 | | // #define MATCH_VP9_QUANTIZE_FP_32X32_C |
151 | | |
152 | | #ifndef MATCH_VP9_QUANTIZE_FP_32X32_C |
// Quantizes one group of 16 coefficients for the 32x32 path without any
// threshold test: every lane is always quantized and stored.
//   abs_q = high 16 bits of (saturating(|coeff| + round) * quant)
//   q     = abs_q carrying the sign of coeff
//   dq    = ((low 16 bits of abs_q * dequant) >> 1, logical shift) carrying
//           the sign of coeff
// thr is accepted only so the signature matches quantize_fp_32x32_16(); it
// is not read. eob_max is updated in place from the lanes with nonzero abs_q.
153 | | static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag( |
154 | | const __m256i *round, const __m256i *quant, const __m256i *dequant, |
155 | | const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, |
156 | 0 | tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { |
157 | 0 | const __m256i coeff = load_tran_low(coeff_ptr); |
158 | 0 | const __m256i abs_coeff = _mm256_abs_epi16(coeff); |
159 | 0 | const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); |
160 | 0 | const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); |
161 | 0 | const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); |
// The halving is done on the magnitude; the sign is reapplied afterwards.
162 | 0 | const __m256i abs_dqcoeff = |
163 | 0 | _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); |
164 | 0 | const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); |
165 | 0 | const __m256i nz_mask = |
166 | 0 | _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); |
167 | 0 | store_tran_low(qcoeff, qcoeff_ptr); |
168 | 0 | store_tran_low(dqcoeff, dqcoeff_ptr); |
169 | 

170 | 0 | *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); |
// Silences the unused-parameter warning.
171 | 0 | (void)thr; |
172 | 0 | } |
173 | | #endif |
174 | | |
// Quantizes one group of 16 coefficients for the 32x32 path.
// If no |coeff| exceeds its thr lane, both outputs are zero-filled and
// eob_max is left untouched. Otherwise:
//   abs_q = high 16 bits of (saturating(|coeff| + round) * quant)
//   q     = abs_q carrying the sign of coeff
//   dq    = ((low 16 bits of abs_q * dequant) >> 1, logical shift) carrying
//           the sign of coeff
// When MATCH_VP9_QUANTIZE_FP_32X32_C is defined, the rounded magnitude is
// first ANDed with the threshold mask, so lanes at or below thr quantize to
// zero individually; without it every lane of a non-skipped group is
// quantized.
175 | | static VPX_FORCE_INLINE void quantize_fp_32x32_16( |
176 | | const __m256i *round, const __m256i *quant, const __m256i *dequant, |
177 | | const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, |
178 | 0 | tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { |
179 | 0 | const __m256i coeff = load_tran_low(coeff_ptr); |
180 | 0 | const __m256i abs_coeff = _mm256_abs_epi16(coeff); |
181 | 0 | const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr); |
// Nonzero when at least one lane is above the threshold.
182 | 0 | const int32_t nzflag = _mm256_movemask_epi8(thr_mask); |
183 | 

184 | 0 | if (nzflag) { |
185 | | #ifdef MATCH_VP9_QUANTIZE_FP_32X32_C |
186 | | const __m256i tmp_rnd = |
187 | | _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask); |
188 | | #else |
189 | 0 | const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); |
190 | 0 | #endif |
191 | 0 | const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); |
192 | 0 | const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); |
193 | 0 | const __m256i abs_dqcoeff = |
194 | 0 | _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); |
195 | 0 | const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); |
196 | 0 | const __m256i nz_mask = |
197 | 0 | _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); |
198 | 0 | store_tran_low(qcoeff, qcoeff_ptr); |
199 | 0 | store_tran_low(dqcoeff, dqcoeff_ptr); |
200 | 

201 | 0 | *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); |
202 | 0 | } else { |
203 | 0 | store_zero_tran_low(qcoeff_ptr); |
204 | 0 | store_zero_tran_low(dqcoeff_ptr); |
205 | 0 | } |
206 | 0 | } |
207 | | |
// AVX2 quantizer entry point for the 32x32 path, 16 coefficients per step.
//   coeff_ptr   - input coefficients.
//   n_coeffs    - number of coefficients.
//   mb_plane    - supplies the round_fp and quant_fp tables.
//   qcoeff_ptr  - output: quantized coefficients.
//   dqcoeff_ptr - output: dequantized coefficients (halved magnitude, see
//                 the per-group helpers).
//   dequant_ptr - dequantization table.
//   eob_ptr     - output: the largest iscan value found at a position whose
//                 quantized coefficient is nonzero, or 0 if there is none.
//   scan_order  - only its iscan table is read.
// NOTE(review): the first group is processed unconditionally and the loop
// advances by 16, so n_coeffs is assumed to be a positive multiple of 16;
// nothing in this function checks that.
208 | | void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, |
209 | | const struct macroblock_plane *const mb_plane, |
210 | | tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, |
211 | | const int16_t *dequant_ptr, uint16_t *eob_ptr, |
212 | 0 | const struct ScanOrder *const scan_order) { |
213 | 0 | __m256i round, quant, dequant, thr; |
214 | 0 | __m256i eob_max = _mm256_setzero_si256(); |
215 | 0 | const int16_t *iscan = scan_order->iscan; |
216 | 

// Point every array at its end and let a negative index count up to zero.
217 | 0 | coeff_ptr += n_coeffs; |
218 | 0 | iscan += n_coeffs; |
219 | 0 | qcoeff_ptr += n_coeffs; |
220 | 0 | dqcoeff_ptr += n_coeffs; |
221 | 0 | n_coeffs = -n_coeffs; |
222 | | |
223 | | // Setup global values |
224 | 0 | load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); |
// 32x32 adjustments: thr = dequant >> 2, quant doubled, and
// round = (round + 1) >> 1.
225 | 0 | thr = _mm256_srli_epi16(dequant, 2); |
226 | 0 | quant = _mm256_slli_epi16(quant, 1); |
227 | 0 | { |
228 | 0 | const __m256i rnd = _mm256_set1_epi16((int16_t)1); |
229 | 0 | round = _mm256_add_epi16(round, rnd); |
230 | 0 | round = _mm256_srai_epi16(round, 1); |
231 | 0 | } |
232 | 

233 | | #ifdef MATCH_VP9_QUANTIZE_FP_32X32_C |
234 | | // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when |
235 | | // calculating the zbin mask. |
// (|coeff| > thr - 1 is the same test as |coeff| >= thr.)
236 | | thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1)); |
237 | | quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, |
238 | | iscan + n_coeffs, qcoeff_ptr + n_coeffs, |
239 | | dqcoeff_ptr + n_coeffs, &eob_max); |
240 | | #else |
// Without the match flag the first group is quantized with no threshold
// test at all.
241 | 0 | quantize_fp_32x32_16_no_nzflag( |
242 | 0 | &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, |
243 | 0 | qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); |
244 | 0 | #endif |
245 | 

246 | 0 | n_coeffs += 8 * 2; |
247 | | |
248 | | // remove dc constants |
// Control 0x31 copies the upper 128 bits into both halves, discarding the
// values held in the low half.
249 | 0 | dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); |
250 | 0 | quant = _mm256_permute2x128_si256(quant, quant, 0x31); |
251 | 0 | round = _mm256_permute2x128_si256(round, round, 0x31); |
252 | 0 | thr = _mm256_permute2x128_si256(thr, thr, 0x31); |
253 | | |
254 | | // AC only loop |
255 | 0 | while (n_coeffs < 0) { |
256 | 0 | quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, |
257 | 0 | iscan + n_coeffs, qcoeff_ptr + n_coeffs, |
258 | 0 | dqcoeff_ptr + n_coeffs, &eob_max); |
259 | 0 | n_coeffs += 8 * 2; |
260 | 0 | } |
261 | 

262 | 0 | *eob_ptr = get_max_eob(eob_max); |
263 | 0 | } |
264 | | |
265 | | #if CONFIG_VP9_HIGHBITDEPTH |
// For each of the eight 32-bit lanes, returns the low 32 bits of
// (x * y) >> (16 - log_scale), where the product is 64 bits wide.
// _mm256_mul_epi32 multiplies only the low (even) 32-bit lane of every
// 64-bit element, as signed values, so even and odd lanes are multiplied
// separately and merged at the end. The right shifts are logical.
// NOTE(review): products are presumably non-negative at the visible call
// sites (rounded magnitudes times zero-extended quant values) -- confirm
// before reusing this with signed inputs.
266 | | static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, |
267 | | const __m256i *y, |
268 | 0 | int log_scale) { |
// Even lanes: 64-bit products of lanes 0, 2, 4, 6.
269 | 0 | __m256i prod_lo = _mm256_mul_epi32(*x, *y); |
// Odd lanes: move lanes 1, 3, 5, 7 into the even positions first.
270 | 0 | __m256i prod_hi = _mm256_srli_epi64(*x, 32); |
271 | 0 | const __m256i mult_hi = _mm256_srli_epi64(*y, 32); |
// All-ones in the even 32-bit lanes, zero in the odd ones.
272 | 0 | const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); |
273 | 0 | prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); |
274 | 0 | prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); |
275 | 0 | prod_lo = _mm256_and_si256(prod_lo, mask); |
276 | 0 | prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); |
// Put the odd-lane results back into the odd positions.
277 | 0 | prod_hi = _mm256_slli_epi64(prod_hi, 32); |
278 | 0 | return _mm256_or_si256(prod_lo, prod_hi); |
279 | 0 | } |
280 | | |
// Loads eight int16_t values and zero-extends them to eight 32-bit lanes:
// values 0..3 fill the low 128 bits, values 4..7 the high 128 bits.
// _mm_load_si128 is an aligned load, so val_ptr must be 16-byte aligned.
// NOTE(review): the names dc/ac suggest value 0 is the DC entry and the
// rest are AC entries -- confirm against the table layout.
281 | 0 | static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) { |
282 | 0 | const __m128i v = _mm_load_si128((const __m128i *)val_ptr); |
283 | 0 | const __m128i zero = _mm_setzero_si128(); |
// Interleaving with zero widens each 16-bit value to 32 bits.
284 | 0 | const __m128i dc = _mm_unpacklo_epi16(v, zero); |
285 | 0 | const __m128i ac = _mm_unpackhi_epi16(v, zero); |
286 | 0 | return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); |
287 | 0 | } |
288 | | |
// Loads the round, quant and dequant tables as eight 32-bit lanes each,
// using highbd_init_256().
//   mb_plane    - supplies the round_fp and quant_fp tables.
//   dequant_ptr - dequantization table.
//   round, quant, dequant - outputs.
289 | | static VPX_FORCE_INLINE void highbd_load_fp_values( |
290 | | const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, |
291 | 0 | const int16_t *dequant_ptr, __m256i *dequant) { |
292 | 0 | *round = highbd_init_256(mb_plane->round_fp); |
293 | 0 | *quant = highbd_init_256(mb_plane->quant_fp); |
294 | 0 | *dequant = highbd_init_256(dequant_ptr); |
295 | 0 | } |
296 | | |
// Folds 8 inverse-scan values into the running per-lane maximum.
//   iscan_ptr - 8 int16_t values, read with an unaligned 128-bit load.
//   eobmax    - running maximum, 16-bit lanes.
//   nz_mask   - 8 x 32-bit mask from the caller's compare.
// Returns the lane-wise signed 16-bit maximum of eobmax and the masked
// iscan values.
297 | | static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( |
298 | 0 | const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { |
// Narrow the mask to 16 bits. The pack works per 128-bit lane, so the eight
// mask values end up in 64-bit elements 0 and 2.
299 | 0 | const __m256i packed_nz_mask = |
300 | 0 | _mm256_packs_epi32(nz_mask, _mm256_setzero_si256()); |
// 0xD8 selects elements {0, 2, 1, 3}: the mask values become contiguous in
// the low 128 bits and the high 128 bits are zero.
301 | 0 | const __m256i packed_nz_mask_perm = |
302 | 0 | _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); |
// The cast leaves the upper 128 bits undefined; the AND below clears them
// because the upper half of the permuted mask is zero.
303 | 0 | const __m256i iscan = |
304 | 0 | _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); |
305 | 0 | const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); |
306 | 0 | return _mm256_max_epi16(eobmax, nz_iscan); |
307 | 0 | } |
308 | | |
// Quantizes one group of 8 coefficients using 32-bit lanes. There is no
// threshold test: every lane is always quantized and stored.
//   abs_q = low 32 bits of ((|coeff| + round) * quant) >> 16
//   q     = abs_q carrying the sign of coeff (0 where coeff is 0)
//   dq    = (low 32 bits of abs_q * dequant) carrying the sign of coeff
//   coeff_ptr  - 8 input coefficients, read with an unaligned load.
//   iscan_ptr  - the 8 matching inverse-scan values.
//   qcoeff_ptr, dqcoeff_ptr - 8 outputs each, written with unaligned stores.
//   eob        - running maximum, updated in place from lanes with abs_q > 0.
309 | | static VPX_FORCE_INLINE void highbd_quantize_fp( |
310 | | const __m256i *round, const __m256i *quant, const __m256i *dequant, |
311 | | const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, |
312 | 0 | tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { |
313 | 0 | const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); |
314 | 0 | const __m256i abs_coeff = _mm256_abs_epi32(coeff); |
315 | 0 | const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round); |
316 | 0 | const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); |
317 | 0 | const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant); |
318 | 0 | const __m256i q = _mm256_sign_epi32(abs_q, coeff); |
319 | 0 | const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); |
// All-ones in lanes whose quantized magnitude is nonzero.
320 | 0 | const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); |
321 | 

322 | 0 | _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); |
323 | 0 | _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); |
324 | 

325 | 0 | *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); |
326 | 0 | } |
327 | | |
// High bitdepth AVX2 quantizer entry point, 8 coefficients per step.
//   coeff_ptr   - input coefficients.
//   n_coeffs    - number of coefficients.
//   mb_plane    - supplies the round_fp and quant_fp tables.
//   qcoeff_ptr  - output: quantized coefficients.
//   dqcoeff_ptr - output: dequantized coefficients.
//   dequant_ptr - dequantization table.
//   eob_ptr     - output: the largest iscan value found at a position whose
//                 quantized coefficient is nonzero, or 0 if there is none.
//   scan_order  - only its iscan table is read.
// NOTE(review): the first group is processed unconditionally and the loop
// advances by `step`, so n_coeffs is assumed to be a positive multiple of 8;
// nothing in this function checks that.
328 | | void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, |
329 | | const struct macroblock_plane *const mb_plane, |
330 | | tran_low_t *qcoeff_ptr, |
331 | | tran_low_t *dqcoeff_ptr, |
332 | | const int16_t *dequant_ptr, uint16_t *eob_ptr, |
333 | 0 | const struct ScanOrder *const scan_order) { |
334 | 0 | const int step = 8; |
335 | 0 | __m256i round, quant, dequant; |
336 | 0 | __m256i eob_max = _mm256_setzero_si256(); |
337 | 0 | const int16_t *iscan = scan_order->iscan; |
338 | 

// Point every array at its end and let a negative index count up to zero.
339 | 0 | coeff_ptr += n_coeffs; |
340 | 0 | iscan += n_coeffs; |
341 | 0 | qcoeff_ptr += n_coeffs; |
342 | 0 | dqcoeff_ptr += n_coeffs; |
343 | 0 | n_coeffs = -n_coeffs; |
344 | | |
345 | | // Setup global values |
346 | 0 | highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); |
347 | 

// First group: uses the tables exactly as loaded (values 0..7).
348 | 0 | highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, |
349 | 0 | iscan + n_coeffs, qcoeff_ptr + n_coeffs, |
350 | 0 | dqcoeff_ptr + n_coeffs, &eob_max); |
351 | 

352 | 0 | n_coeffs += step; |
353 | | |
354 | | // remove dc constants |
// Control 0x31 copies the upper 128 bits into both halves, discarding the
// values held in the low half.
355 | 0 | dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); |
356 | 0 | quant = _mm256_permute2x128_si256(quant, quant, 0x31); |
357 | 0 | round = _mm256_permute2x128_si256(round, round, 0x31); |
358 | | |
359 | | // AC only loop |
360 | 0 | while (n_coeffs < 0) { |
361 | 0 | highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, |
362 | 0 | iscan + n_coeffs, qcoeff_ptr + n_coeffs, |
363 | 0 | dqcoeff_ptr + n_coeffs, &eob_max); |
364 | 0 | n_coeffs += step; |
365 | 0 | } |
366 | 

367 | 0 | *eob_ptr = get_max_eob(eob_max); |
368 | 0 | } |
369 | | |
// Quantizes one group of 8 coefficients for the 32x32 high bitdepth path
// using 32-bit lanes. Lanes whose |coeff| does not exceed thr are forced to
// zero individually; there is no whole-group skip.
//   abs_q = low 32 bits of (((|coeff| + round) & thr_mask) * quant) >> 16
//   q     = abs_q carrying the sign of coeff
//   dq    = ((low 32 bits of abs_q * dequant) >> 1, logical shift) carrying
//           the sign of coeff
//   eob   - running maximum, updated in place from lanes with abs_q > 0.
370 | | static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( |
371 | | const __m256i *round, const __m256i *quant, const __m256i *dequant, |
372 | | const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, |
373 | 0 | tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { |
374 | 0 | const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); |
375 | 0 | const __m256i abs_coeff = _mm256_abs_epi32(coeff); |
376 | 0 | const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr); |
// Zero the rounded magnitude in lanes at or below the threshold.
377 | 0 | const __m256i tmp_rnd = |
378 | 0 | _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask); |
379 | 0 | const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); |
380 | 0 | const __m256i abs_dq = |
381 | 0 | _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1); |
382 | 0 | const __m256i q = _mm256_sign_epi32(abs_q, coeff); |
383 | 0 | const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); |
384 | 0 | const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); |
385 | 

386 | 0 | _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); |
387 | 0 | _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); |
388 | 

389 | 0 | *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); |
390 | 0 | } |
391 | | |
// High bitdepth AVX2 quantizer entry point for the 32x32 path, 8
// coefficients per step.
//   coeff_ptr   - input coefficients.
//   n_coeffs    - number of coefficients.
//   mb_plane    - supplies the round_fp and quant_fp tables.
//   qcoeff_ptr  - output: quantized coefficients.
//   dqcoeff_ptr - output: dequantized coefficients (halved magnitude, see
//                 highbd_quantize_fp_32x32()).
//   dequant_ptr - dequantization table.
//   eob_ptr     - output: the largest iscan value found at a position whose
//                 quantized coefficient is nonzero, or 0 if there is none.
//   scan_order  - only its iscan table is read.
// NOTE(review): the first group is processed unconditionally and the loop
// advances by `step`, so n_coeffs is assumed to be a positive multiple of 8;
// nothing in this function checks that.
392 | | void vp9_highbd_quantize_fp_32x32_avx2( |
393 | | const tran_low_t *coeff_ptr, intptr_t n_coeffs, |
394 | | const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, |
395 | | tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, |
396 | 0 | const struct ScanOrder *const scan_order) { |
397 | 0 | const int step = 8; |
398 | 0 | __m256i round, quant, dequant, thr; |
399 | 0 | __m256i eob_max = _mm256_setzero_si256(); |
400 | 0 | const int16_t *iscan = scan_order->iscan; |
401 | 

// Point every array at its end and let a negative index count up to zero.
402 | 0 | coeff_ptr += n_coeffs; |
403 | 0 | iscan += n_coeffs; |
404 | 0 | qcoeff_ptr += n_coeffs; |
405 | 0 | dqcoeff_ptr += n_coeffs; |
406 | 0 | n_coeffs = -n_coeffs; |
407 | | |
408 | | // Setup global values |
409 | 0 | highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); |
// 32x32 adjustments: thr = (dequant >> 2) - 1, quant doubled, and
// round = (round + 1) >> 1.
410 | 0 | thr = _mm256_srli_epi32(dequant, 2); |
411 | | // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when |
412 | | // calculating the zbin mask. |
// (|coeff| > thr - 1 is the same test as |coeff| >= thr.)
413 | 0 | thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1)); |
414 | 0 | quant = _mm256_slli_epi32(quant, 1); |
415 | 0 | round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1); |
416 | 

// First group: uses the tables exactly as loaded (values 0..7).
417 | 0 | highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, |
418 | 0 | iscan + n_coeffs, qcoeff_ptr + n_coeffs, |
419 | 0 | dqcoeff_ptr + n_coeffs, &eob_max); |
420 | 

421 | 0 | n_coeffs += step; |
422 | | |
423 | | // remove dc constants |
// Control 0x31 copies the upper 128 bits into both halves, discarding the
// values held in the low half.
424 | 0 | dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); |
425 | 0 | quant = _mm256_permute2x128_si256(quant, quant, 0x31); |
426 | 0 | round = _mm256_permute2x128_si256(round, round, 0x31); |
427 | 0 | thr = _mm256_permute2x128_si256(thr, thr, 0x31); |
428 | | |
429 | | // AC only loop |
430 | 0 | while (n_coeffs < 0) { |
431 | 0 | highbd_quantize_fp_32x32( |
432 | 0 | &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, |
433 | 0 | qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); |
434 | 0 | n_coeffs += step; |
435 | 0 | } |
436 | 

437 | 0 | *eob_ptr = get_max_eob(eob_max); |
438 | 0 | } |
439 | | #endif // CONFIG_VP9_HIGHBITDEPTH |