/src/libvpx/vpx_dsp/x86/quantize_avx2.c
/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_scan.h"
#include "vp9/encoder/vp9_block.h"

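// Load the 8-entry zbin/round/quant/quant_shift/dequant tables and arrange
// each register so that word 0 holds the DC value and words 1-15 hold the AC
// value: permute4x64 with 0x54 keeps qwords 0-1 in the low 128-bit lane and
// replicates qword 1 (all-AC) across the high lane.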
static VPX_FORCE_INLINE void load_b_values_avx2(
    const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round,
    __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant,
    __m256i *shift, int log_scale) {
  *zbin =
      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin));
  *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
  if (log_scale > 0) {
    const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
    *zbin = _mm256_add_epi16(*zbin, rnd);
    *zbin = _mm256_srai_epi16(*zbin, log_scale);
  }
  // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
  // calculating the zbin mask: "abs_coeff >= zbin" becomes the single
  // comparison "abs_coeff > zbin - 1". (See the v_zbin_mask computation in
  // quantize_b_16() and quantize_b_32x32_16().)
  *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));

  *round =
      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round));
  *round = _mm256_permute4x64_epi64(*round, 0x54);
  if (log_scale > 0) {
    const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
    *round = _mm256_add_epi16(*round, rnd);
    *round = _mm256_srai_epi16(*round, log_scale);
  }

  *quant =
      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant));
  *quant = _mm256_permute4x64_epi64(*quant, 0x54);
  *dequant =
      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
  *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
  *shift = _mm256_castsi128_si256(
      _mm_load_si128((const __m128i *)mb_plane->quant_shift));
  *shift = _mm256_permute4x64_epi64(*shift, 0x54);
}

static VPX_FORCE_INLINE __m256i
load_coefficients_avx2(const tran_low_t *coeff_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  // typedef int32_t tran_low_t;
  const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr);
  const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8));
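  // _mm256_packs_epi32() packs within each 128-bit lane, so the 16 words come
  // out in coefficient order 0-3, 8-11, 4-7, 12-15. get_max_lane_eob()
  // compensates by permuting iscan with the matching 0xD8 qword pattern.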
  return _mm256_packs_epi32(coeff1, coeff2);
#else
  // typedef int16_t tran_low_t;
  return _mm256_loadu_si256((const __m256i *)coeff_ptr);
#endif
}

static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals,
                                                     tran_low_t *coeff_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  // typedef int32_t tran_low_t;
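  // Sign-extend the 16 int16 values back to int32 by interleaving each word
  // with its sign mask.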
  __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
  __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
  __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
  _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo);
  _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
#else
  // typedef int16_t tran_low_t;
  _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals);
#endif
}

static VPX_FORCE_INLINE __m256i
quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
              tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
              __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
  const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
  const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
  const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);

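  // If no coefficient exceeds its zbin, the whole group quantizes to zero:
  // store zeros and skip the multiplies.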
  if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
    _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
    _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
#if CONFIG_VP9_HIGHBITDEPTH
    _mm256_storeu_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
#endif  // CONFIG_VP9_HIGHBITDEPTH
    return _mm256_setzero_si256();
  }
  {
    // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
    const __m256i v_tmp_rnd =
        _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);

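    // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
    //                quant_shift_ptr[rc != 0]) >> 16);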
    const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
    const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
    const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift);
    const __m256i v_nz_mask =
        _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
    const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
#if CONFIG_VP9_HIGHBITDEPTH
    const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
    const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant);

    const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high);
    const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high);
#else
    const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
#endif

    store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
#if CONFIG_VP9_HIGHBITDEPTH
    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
#else
    store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
#endif
    return v_nz_mask;
  }
}
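
// For reference, a scalar sketch of what quantize_b_16() computes per
// coefficient, modeled on vpx_quantize_b_c() in vpx_dsp/quantize.c; the
// variable names here are illustrative, not the exact C implementation:
//
//   const int coeff = coeff_ptr[rc];
//   const int coeff_sign = (coeff >> 31);
//   const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
//   if (abs_coeff >= zbin[rc != 0]) {
//     const int tmp = clamp(abs_coeff + round[rc != 0], INT16_MIN, INT16_MAX);
//     const int abs_q =
//         ((((tmp * quant[rc != 0]) >> 16) + tmp) * quant_shift[rc != 0]) >>
//         16;
//     qcoeff_ptr[rc] = (abs_q ^ coeff_sign) - coeff_sign;
//     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
//   }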

static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
                                                 __m256i v_eobmax,
                                                 __m256i v_mask) {
#if CONFIG_VP9_HIGHBITDEPTH
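  // Undo the lane-crossed word order produced by _mm256_packs_epi32() in
  // load_coefficients_avx2(): permuting qwords with 0xD8 (0, 2, 1, 3) lines
  // each iscan word up with its coefficient's mask lane.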
  const __m256i v_iscan = _mm256_permute4x64_epi64(
      _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
#else
  const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
#endif
  const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
  return _mm256_max_epi16(v_eobmax, v_nz_iscan);
}

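// Horizontal maximum of the 16 eob candidates in eob256: fold the two 128-bit
// halves together, then halve the active width with shuffle + max until the
// maximum sits in both word 0 and word 1 (either may be extracted).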
static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) {
  const __m128i eob_lo = _mm256_castsi256_si128(eob256);
  const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
  __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
  __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
  eob = _mm_max_epi16(eob, eob_shuffled);
  eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
  eob = _mm_max_epi16(eob, eob_shuffled);
  eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
  eob = _mm_max_epi16(eob, eob_shuffled);
  return _mm_extract_epi16(eob, 1);
}

void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const struct macroblock_plane *const mb_plane,
                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
                         const struct ScanOrder *const scan_order) {
  __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask;
  __m256i v_eobmax = _mm256_setzero_si256();
  intptr_t count;
  const int16_t *iscan = scan_order->iscan;

  load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr,
                     &v_dequant, &v_quant_shift, 0);
  // Do DC and first 15 AC.
  v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
                            &v_dequant, &v_round, &v_zbin, &v_quant_shift);

  v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);

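  // Broadcast the AC entry (the high qword of each 128-bit lane) to every
  // position; the remaining blocks use only the AC quantizer values.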
  v_round = _mm256_unpackhi_epi64(v_round, v_round);
  v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
  v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
  v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
  v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);

  for (count = n_coeffs - 16; count > 0; count -= 16) {
    coeff_ptr += 16;
    qcoeff_ptr += 16;
    dqcoeff_ptr += 16;
    iscan += 16;
    v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
                              &v_dequant, &v_round, &v_zbin, &v_quant_shift);

    v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
  }

  *eob_ptr = accumulate_eob256(v_eobmax);
}

static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant,
    __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin,
    __m256i *v_quant_shift, __m256i *v_eobmax) {
  const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
  const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
  const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);

  if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
    _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
    _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
#if CONFIG_VP9_HIGHBITDEPTH
    _mm256_storeu_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
#endif  // CONFIG_VP9_HIGHBITDEPTH
    return *v_eobmax;
  }
  {
    // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0
    const __m256i v_tmp_rnd =
        _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
    // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
    //                quant_shift_ptr[rc != 0]) >> 15);
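    // A 16-bit multiply exposes only the high or the low half of the 32-bit
    // product, so (x * quant_shift) >> 15 is assembled from
    // (mulhi(x, quant_shift) << 1) | (mullo(x, quant_shift) >> 15).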
    const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
    const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
    const __m256i v_tmp32_hi =
        _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1);
    const __m256i v_tmp32_lo =
        _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15);
    const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo);
    const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
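    // dqcoeff = sign(coeff) * ((abs_qcoeff * dequant) >> 1), computed at
    // 32-bit precision with the sign re-applied after the shift.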
    const __m256i v_sign_lo =
        _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff);
    const __m256i v_sign_hi =
        _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff);
    const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant);
    const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant);
    const __m256i v_dqcoeff_lo = _mm256_sign_epi32(
        _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo);
    const __m256i v_dqcoeff_hi = _mm256_sign_epi32(
        _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi);
    const __m256i v_nz_mask =
        _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());

    store_coefficients_avx2(v_qcoeff, qcoeff_ptr);

#if CONFIG_VP9_HIGHBITDEPTH
    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
#else
    store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi),
                            dqcoeff_ptr);
#endif

    return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask);
  }
}

void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr,
                               const struct macroblock_plane *const mb_plane,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const struct ScanOrder *const scan_order) {
  __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
  __m256i v_eobmax = _mm256_setzero_si256();
  intptr_t count;
  const int16_t *iscan = scan_order->iscan;

  load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr,
                     &v_dequant, &v_quant_shift, 1);

  // Do DC and first 15 AC.
  v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
                                 &v_quant, &v_dequant, &v_round, &v_zbin,
                                 &v_quant_shift, &v_eobmax);

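  // Switch every position to the AC quantizer values for the remaining
  // blocks.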
  v_round = _mm256_unpackhi_epi64(v_round, v_round);
  v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
  v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
  v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
  v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);

  for (count = (32 * 32) - 16; count > 0; count -= 16) {
    coeff_ptr += 16;
    qcoeff_ptr += 16;
    dqcoeff_ptr += 16;
    iscan += 16;
    v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
                                   &v_quant, &v_dequant, &v_round, &v_zbin,
                                   &v_quant_shift, &v_eobmax);
  }

  *eob_ptr = accumulate_eob256(v_eobmax);
}