/src/libvpx/vpx_dsp/x86/quantize_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2022 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <assert.h> |
12 | | #include <immintrin.h> |
13 | | |
14 | | #include "./vpx_dsp_rtcd.h" |
15 | | #include "vpx/vpx_integer.h" |
16 | | #include "vp9/common/vp9_scan.h" |
17 | | #include "vp9/encoder/vp9_block.h" |
18 | | |
19 | | static VPX_FORCE_INLINE void load_b_values_avx2( |
20 | | const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round, |
21 | | __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, |
22 | 508M | __m256i *shift, int log_scale) { |
23 | 508M | *zbin = |
24 | 508M | _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin)); |
25 | 508M | *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); |
26 | 508M | if (log_scale > 0) { |
27 | 3.70M | const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); |
28 | 3.70M | *zbin = _mm256_add_epi16(*zbin, rnd); |
29 | 3.70M | *zbin = _mm256_srai_epi16(*zbin, log_scale); |
30 | 3.70M | } |
31 | | // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when |
32 | | // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16) |
33 | 508M | *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); |
34 | | |
35 | 508M | *round = |
36 | 508M | _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round)); |
37 | 508M | *round = _mm256_permute4x64_epi64(*round, 0x54); |
38 | 508M | if (log_scale > 0) { |
39 | 3.70M | const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); |
40 | 3.70M | *round = _mm256_add_epi16(*round, rnd); |
41 | 3.70M | *round = _mm256_srai_epi16(*round, log_scale); |
42 | 3.70M | } |
43 | | |
44 | 508M | *quant = |
45 | 508M | _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant)); |
46 | 508M | *quant = _mm256_permute4x64_epi64(*quant, 0x54); |
47 | 508M | *dequant = |
48 | 508M | _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); |
49 | 508M | *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); |
50 | 508M | *shift = _mm256_castsi128_si256( |
51 | 508M | _mm_load_si128((const __m128i *)mb_plane->quant_shift)); |
52 | 508M | *shift = _mm256_permute4x64_epi64(*shift, 0x54); |
53 | 508M | } |
54 | | |
55 | | static VPX_FORCE_INLINE __m256i |
56 | 1.19G | load_coefficients_avx2(const tran_low_t *coeff_ptr) { |
57 | 1.19G | #if CONFIG_VP9_HIGHBITDEPTH |
58 | | // typedef int32_t tran_low_t; |
59 | 1.19G | const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr); |
60 | 1.19G | const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8)); |
61 | 1.19G | return _mm256_packs_epi32(coeff1, coeff2); |
62 | | #else |
63 | | // typedef int16_t tran_low_t; |
64 | | return _mm256_loadu_si256((const __m256i *)coeff_ptr); |
65 | | #endif |
66 | 1.19G | } |
67 | | |
68 | | static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals, |
69 | 898M | tran_low_t *coeff_ptr) { |
70 | 898M | #if CONFIG_VP9_HIGHBITDEPTH |
71 | | // typedef int32_t tran_low_t; |
72 | 898M | __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); |
73 | 898M | __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); |
74 | 898M | __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); |
75 | 898M | _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo); |
76 | 898M | _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); |
77 | | #else |
78 | | // typedef int16_t tran_low_t; |
79 | | _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals); |
80 | | #endif |
81 | 898M | } |
82 | | |
83 | | static VPX_FORCE_INLINE __m256i |
84 | | quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, |
85 | | tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant, |
86 | 959M | __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) { |
87 | 959M | const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); |
88 | 959M | const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); |
89 | 959M | const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); |
90 | | |
91 | 959M | if (_mm256_movemask_epi8(v_zbin_mask) == 0) { |
92 | 212M | _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); |
93 | 212M | _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); |
94 | 212M | #if CONFIG_VP9_HIGHBITDEPTH |
95 | 212M | _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); |
96 | 212M | _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); |
97 | 212M | #endif // CONFIG_VP9_HIGHBITDEPTH |
98 | 212M | return _mm256_setzero_si256(); |
99 | 212M | } |
100 | 747M | { |
101 | | // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0 |
102 | 747M | const __m256i v_tmp_rnd = |
103 | 747M | _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); |
104 | | |
105 | 747M | const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); |
106 | 747M | const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); |
107 | 747M | const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift); |
108 | 747M | const __m256i v_nz_mask = |
109 | 747M | _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); |
110 | 747M | const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); |
111 | 747M | #if CONFIG_VP9_HIGHBITDEPTH |
112 | 747M | const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant); |
113 | 747M | const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant); |
114 | | |
115 | 747M | const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high); |
116 | 747M | const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high); |
117 | | #else |
118 | | const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant); |
119 | | #endif |
120 | | |
121 | 747M | store_coefficients_avx2(v_qcoeff, qcoeff_ptr); |
122 | 747M | #if CONFIG_VP9_HIGHBITDEPTH |
123 | 747M | _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); |
124 | 747M | _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); |
125 | | #else |
126 | | store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr); |
127 | | #endif |
128 | 747M | return v_nz_mask; |
129 | 959M | } |
130 | 959M | } |
131 | | |
132 | | static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, |
133 | | __m256i v_eobmax, |
134 | 1.11G | __m256i v_mask) { |
135 | 1.11G | #if CONFIG_VP9_HIGHBITDEPTH |
136 | 1.11G | const __m256i v_iscan = _mm256_permute4x64_epi64( |
137 | 1.11G | _mm256_loadu_si256((const __m256i *)iscan), 0xD8); |
138 | | #else |
139 | | const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); |
140 | | #endif |
141 | 1.11G | const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); |
142 | 1.11G | return _mm256_max_epi16(v_eobmax, v_nz_iscan); |
143 | 1.11G | } |
144 | | |
145 | 508M | static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { |
146 | 508M | const __m128i eob_lo = _mm256_castsi256_si128(eob256); |
147 | 508M | const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); |
148 | 508M | __m128i eob = _mm_max_epi16(eob_lo, eob_hi); |
149 | 508M | __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); |
150 | 508M | eob = _mm_max_epi16(eob, eob_shuffled); |
151 | 508M | eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); |
152 | 508M | eob = _mm_max_epi16(eob, eob_shuffled); |
153 | 508M | eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); |
154 | 508M | eob = _mm_max_epi16(eob, eob_shuffled); |
155 | 508M | return _mm_extract_epi16(eob, 1); |
156 | 508M | } |
157 | | |
158 | | void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, |
159 | | const struct macroblock_plane *const mb_plane, |
160 | | tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, |
161 | | const int16_t *dequant_ptr, uint16_t *eob_ptr, |
162 | 504M | const struct ScanOrder *const scan_order) { |
163 | 504M | __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; |
164 | 504M | __m256i v_eobmax = _mm256_setzero_si256(); |
165 | 504M | intptr_t count; |
166 | 504M | const int16_t *iscan = scan_order->iscan; |
167 | | |
168 | 504M | load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, |
169 | 504M | &v_dequant, &v_quant_shift, 0); |
170 | | // Do DC and first 15 AC. |
171 | 504M | v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, |
172 | 504M | &v_dequant, &v_round, &v_zbin, &v_quant_shift); |
173 | | |
174 | 504M | v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); |
175 | | |
176 | 504M | v_round = _mm256_unpackhi_epi64(v_round, v_round); |
177 | 504M | v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); |
178 | 504M | v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); |
179 | 504M | v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); |
180 | 504M | v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); |
181 | | |
182 | 959M | for (count = n_coeffs - 16; count > 0; count -= 16) { |
183 | 454M | coeff_ptr += 16; |
184 | 454M | qcoeff_ptr += 16; |
185 | 454M | dqcoeff_ptr += 16; |
186 | 454M | iscan += 16; |
187 | 454M | v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, |
188 | 454M | &v_dequant, &v_round, &v_zbin, &v_quant_shift); |
189 | | |
190 | 454M | v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); |
191 | 454M | } |
192 | | |
193 | 504M | *eob_ptr = accumulate_eob256(v_eobmax); |
194 | 504M | } |
195 | | |
196 | | static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( |
197 | | const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, |
198 | | tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant, |
199 | | __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin, |
200 | 236M | __m256i *v_quant_shift, __m256i *v_eobmax) { |
201 | 236M | const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); |
202 | 236M | const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); |
203 | 236M | const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); |
204 | | |
205 | 236M | if (_mm256_movemask_epi8(v_zbin_mask) == 0) { |
206 | 86.4M | _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); |
207 | 86.4M | _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); |
208 | 86.4M | #if CONFIG_VP9_HIGHBITDEPTH |
209 | 86.4M | _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); |
210 | 86.4M | _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); |
211 | 86.4M | #endif |
212 | 86.4M | return *v_eobmax; |
213 | 86.4M | } |
214 | 150M | { |
215 | | // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0 |
216 | 150M | const __m256i v_tmp_rnd = |
217 | 150M | _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); |
218 | | // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * |
219 | | // quant_shift_ptr[rc != 0]) >> 15); |
220 | 150M | const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); |
221 | 150M | const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); |
222 | 150M | const __m256i v_tmp32_hi = |
223 | 150M | _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1); |
224 | 150M | const __m256i v_tmp32_lo = |
225 | 150M | _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15); |
226 | 150M | const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo); |
227 | 150M | const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); |
228 | 150M | const __m256i v_sign_lo = |
229 | 150M | _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff); |
230 | 150M | const __m256i v_sign_hi = |
231 | 150M | _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff); |
232 | 150M | const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant); |
233 | 150M | const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant); |
234 | 150M | const __m256i v_dqcoeff_lo = _mm256_sign_epi32( |
235 | 150M | _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo); |
236 | 150M | const __m256i v_dqcoeff_hi = _mm256_sign_epi32( |
237 | 150M | _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi); |
238 | 150M | const __m256i v_nz_mask = |
239 | 150M | _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); |
240 | | |
241 | 150M | store_coefficients_avx2(v_qcoeff, qcoeff_ptr); |
242 | | |
243 | 150M | #if CONFIG_VP9_HIGHBITDEPTH |
244 | 150M | _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); |
245 | 150M | _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); |
246 | | #else |
247 | | store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi), |
248 | | dqcoeff_ptr); |
249 | | #endif |
250 | | |
251 | 150M | return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask); |
252 | 236M | } |
253 | 236M | } |
254 | | |
255 | | void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, |
256 | | const struct macroblock_plane *const mb_plane, |
257 | | tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, |
258 | | const int16_t *dequant_ptr, uint16_t *eob_ptr, |
259 | 3.70M | const struct ScanOrder *const scan_order) { |
260 | 3.70M | __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; |
261 | 3.70M | __m256i v_eobmax = _mm256_setzero_si256(); |
262 | 3.70M | intptr_t count; |
263 | 3.70M | const int16_t *iscan = scan_order->iscan; |
264 | | |
265 | 3.70M | load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, |
266 | 3.70M | &v_dequant, &v_quant_shift, 1); |
267 | | |
268 | | // Do DC and first 15 AC. |
269 | 3.70M | v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, |
270 | 3.70M | &v_quant, &v_dequant, &v_round, &v_zbin, |
271 | 3.70M | &v_quant_shift, &v_eobmax); |
272 | | |
273 | 3.70M | v_round = _mm256_unpackhi_epi64(v_round, v_round); |
274 | 3.70M | v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); |
275 | 3.70M | v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); |
276 | 3.70M | v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); |
277 | 3.70M | v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); |
278 | | |
279 | 236M | for (count = (32 * 32) - 16; count > 0; count -= 16) { |
280 | 233M | coeff_ptr += 16; |
281 | 233M | qcoeff_ptr += 16; |
282 | 233M | dqcoeff_ptr += 16; |
283 | 233M | iscan += 16; |
284 | 233M | v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, |
285 | 233M | &v_quant, &v_dequant, &v_round, &v_zbin, |
286 | 233M | &v_quant_shift, &v_eobmax); |
287 | 233M | } |
288 | | |
289 | 3.70M | *eob_ptr = accumulate_eob256(v_eobmax); |
290 | 3.70M | } |