/rust/registry/src/index.crates.io-1949cf8c6b5b557f/rav1e-0.8.1/src/quantize/mod.rs
// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

#![allow(non_upper_case_globals)]

mod tables;

cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::quantize::*;
  } else {
    pub use self::rust::*;
  }
}

pub use tables::*;

use crate::scan_order::av1_scan_orders;
use crate::transform::{TxSize, TxType};
use crate::util::*;
use std::mem;
use std::num::{NonZeroU16, NonZeroU32, NonZeroU64};

/// Returns the base-2 logarithm of the extra scaling applied to large
/// transforms: 0 when the area is at most 256 pixels, 1 up to 1024 pixels,
/// and 2 above that.
pub fn get_log_tx_scale(tx_size: TxSize) -> usize {
  let num_pixels = tx_size.area();

  Into::<usize>::into(num_pixels > 256)
    + Into::<usize>::into(num_pixels > 1024)
}

pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
  let dc_q: [&[NonZeroU16; 256]; 3] =
    [&dc_qlookup_Q3, &dc_qlookup_10_Q3, &dc_qlookup_12_Q3];
  // Map bit depth 8, 10 and 12 to table index 0, 1 and 2.
  let bd = ((bit_depth ^ 8) >> 1).min(2);
  dc_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
}

pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
  let ac_q: [&[NonZeroU16; 256]; 3] =
    [&ac_qlookup_Q3, &ac_qlookup_10_Q3, &ac_qlookup_12_Q3];
  // Map bit depth 8, 10 and 12 to table index 0, 1 and 2.
  let bd = ((bit_depth ^ 8) >> 1).min(2);
  ac_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
}
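
// A small illustrative test (an addition, not part of the upstream source):
// `qindex + delta_q` is clamped to the valid table range [0, 255], so extreme
// deltas cannot index out of bounds.
#[cfg(test)]
#[test]
fn q_lookup_clamps_qindex() {
  // Saturates at the top of the table...
  assert_eq!(ac_q(255, 127, 8), ac_q(255, 0, 8));
  // ...and at the bottom.
  assert_eq!(dc_q(0, -128, 8), dc_q(0, 0, 8));
}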

// TODO: Handle lossless properly.
fn select_qi(quantizer: i64, qlookup: &[NonZeroU16; QINDEX_RANGE]) -> u8 {
  if quantizer < qlookup[MINQ].get() as i64 {
    MINQ as u8
  } else if quantizer >= qlookup[MAXQ].get() as i64 {
    MAXQ as u8
  } else {
    match qlookup
      .binary_search(&NonZeroU16::new(quantizer as u16).expect("Not zero"))
    {
      Ok(qi) => qi as u8,
      Err(qi) => {
        debug_assert!(qi > MINQ);
        debug_assert!(qi <= MAXQ);
        // Pick the closest quantizer in the log domain: the quantizer is
        // closer to qlookup[qi - 1] than to qlookup[qi] exactly when its
        // square is below the product of the two, i.e. below the squared
        // geometric mean (the log-domain midpoint).
        let qthresh =
          (qlookup[qi - 1].get() as i32) * (qlookup[qi].get() as i32);
        let q2_i32 = (quantizer as i32) * (quantizer as i32);
        if q2_i32 < qthresh {
          (qi - 1) as u8
        } else {
          qi as u8
        }
      }
    }
  }
}

pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 {
  let qlookup = match bit_depth {
    8 => &dc_qlookup_Q3,
    10 => &dc_qlookup_10_Q3,
    12 => &dc_qlookup_12_Q3,
    _ => unimplemented!(),
  };
  select_qi(quantizer, qlookup)
}

pub fn select_ac_qi(quantizer: i64, bit_depth: usize) -> u8 {
  let qlookup = match bit_depth {
    8 => &ac_qlookup_Q3,
    10 => &ac_qlookup_10_Q3,
    12 => &ac_qlookup_12_Q3,
    _ => unimplemented!(),
  };
  select_qi(quantizer, qlookup)
}
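
// An illustrative round-trip test (an addition, not part of the upstream
// source): selecting a qindex for a quantizer value that is already in the
// lookup table must map back to the same quantizer value. Index equality is
// not asserted because the low end of the tables contains duplicate entries.
#[cfg(test)]
#[test]
fn select_ac_qi_round_trip() {
  for qi in 0..=255u8 {
    let q = ac_q(qi, 0, 10).get() as i64;
    assert_eq!(ac_q(select_ac_qi(q, 10), 0, 10), ac_q(qi, 0, 10));
  }
}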

#[derive(Debug, Clone, Copy)]
pub struct QuantizationContext {
  log_tx_scale: usize,
  dc_quant: NonZeroU16,
  dc_offset: u32,
  dc_mul_add: (u32, u32, u32),

  ac_quant: NonZeroU16,
  ac_offset_eob: u32,
  ac_offset0: u32,
  ac_offset1: u32,
  ac_mul_add: (u32, u32, u32),
}

impl Default for QuantizationContext {
  fn default() -> Self {
    QuantizationContext {
      dc_quant: NonZeroU16::new(1).expect("Not zero"),
      ac_quant: NonZeroU16::new(1).expect("Not zero"),
      log_tx_scale: Default::default(),
      dc_offset: Default::default(),
      dc_mul_add: Default::default(),
      ac_offset_eob: Default::default(),
      ac_offset0: Default::default(),
      ac_offset1: Default::default(),
      ac_mul_add: Default::default(),
    }
  }
}

/// Computes a (multiplier, addend, shift) triple for the divisor `d` such
/// that `divu_pair(x, divu_gen(d)) == x / d` for any `u32` dividend `x`,
/// replacing the division with a multiply and shift (the technique of
/// Granlund and Montgomery, "Division by Invariant Integers using
/// Multiplication").
fn divu_gen(d: NonZeroU32) -> (u32, u32, u32) {
  let nbits = (mem::size_of_val(&d) as u64) * 8;
  let m = nbits - d.leading_zeros() as u64 - 1;
  if d.is_power_of_two() {
    (0xFFFF_FFFF, 0xFFFF_FFFF, m as u32)
  } else {
    let d = NonZeroU64::from(d);
    let t = (1u64 << (m + nbits)) / d;

    let d = d.get();
    let r = (t * d + d) & ((1 << nbits) - 1);
    if r <= 1u64 << m {
      (t as u32 + 1, 0u32, m as u32)
    } else {
      (t as u32, t as u32, m as u32)
    }
  }
}

#[inline]
const fn divu_pair(x: u32, d: (u32, u32, u32)) -> u32 {
  let x = x as u64;
  let (a, b, shift) = d;
  let shift = shift as u64;
  let a = a as u64;
  let b = b as u64;

  (((a * x + b) >> 32) >> shift) as u32
}
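
// A supplementary check (an addition, not part of the upstream source) that
// the multiply-shift reciprocal stays exact well beyond the small range
// exercised by `test_divu_pair` below, including inputs near `u32::MAX`.
#[cfg(test)]
#[test]
fn divu_pair_large_inputs() {
  for d in [1u32, 3, 7, 255, 256, 257, 1023, 65_535] {
    let ab = divu_gen(NonZeroU32::new(d).unwrap());
    for x in [0u32, 1, u32::MAX / 2, u32::MAX - 1, u32::MAX] {
      assert_eq!(x / d, divu_pair(x, ab));
    }
  }
}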

#[inline]
const fn copysign(value: u32, signed: i32) -> i32 {
  if signed < 0 {
    -(value as i32)
  } else {
    value as i32
  }
}

#[cfg(test)]
mod test {
  use super::*;
  use crate::transform::TxSize::*;

  #[test]
  fn test_divu_pair() {
    for d in 1..1024 {
      for x in 0..1000 {
        let ab = divu_gen(NonZeroU32::new(d).unwrap());
        assert_eq!(x / d, divu_pair(x, ab));
      }
    }
  }

  #[test]
  fn gen_divu_table() {
    let b: Vec<(u32, u32, u32)> =
      dc_qlookup_Q3.iter().map(|&v| divu_gen(v.into())).collect();

    println!("{:?}", b);
  }

  #[test]
  fn test_tx_log_scale() {
    let tx_sizes = [
      (TX_4X4, 0),
      (TX_8X8, 0),
      (TX_16X16, 0),
      (TX_32X32, 1),
      (TX_64X64, 2),
      (TX_4X8, 0),
      (TX_8X4, 0),
      (TX_8X16, 0),
      (TX_16X8, 0),
      (TX_16X32, 1),
      (TX_32X16, 1),
      (TX_32X64, 2),
      (TX_64X32, 2),
      (TX_4X16, 0),
      (TX_16X4, 0),
      (TX_8X32, 0),
      (TX_32X8, 0),
      (TX_16X64, 1),
      (TX_64X16, 1),
    ];
    for &tx_size in tx_sizes.iter() {
      assert_eq!(tx_size.1, get_log_tx_scale(tx_size.0));
    }
  }
}

impl QuantizationContext {
  pub fn update(
    &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, bit_depth: usize,
    dc_delta_q: i8, ac_delta_q: i8,
  ) {
    self.log_tx_scale = get_log_tx_scale(tx_size);

    self.dc_quant = dc_q(qindex, dc_delta_q, bit_depth);
    self.dc_mul_add = divu_gen(self.dc_quant.into());

    self.ac_quant = ac_q(qindex, ac_delta_q, bit_depth);
    self.ac_mul_add = divu_gen(self.ac_quant.into());

    // All of these biases were derived by measuring the cost of coding
    // a zero vs coding a one on any given coefficient position, or, in
    // the case of the EOB bias, the cost of coding the block with
    // the chosen EOB (rounding to one) vs rounding to zero and continuing
    // to choose a new EOB. This was done over several clips, with the
    // average of the bit costs taken over all blocks in the set, and a new
    // bias derived via the method outlined in Jean-Marc Valin's
    // Journal of Dubious Theoretical Results[1], aka:
    //
    // lambda = ln(2) / 6.0
    // threshold = 0.5 + (lambda * avg_rate_diff) / 2.0
    // bias = 1 - threshold
    //
    // lambda is a constant since our offsets are already adjusted for the
    // quantizer.
    //
    // Biases were then updated, and cost collection was re-run, until
    // the calculated biases started to converge after 2-4 iterations.
    //
    // In theory, the rounding biases for inter should be somewhat smaller
    // than the biases for intra, but this turns out to only be the case
    // for EOB optimization, or at least, is covered by EOB optimization.
    // The RD-optimal rounding biases for the actual coefficients seem
    // to be quite close (+/- 1/256), for both inter and intra,
    // post-deadzoning.
    //
    // [1] https://jmvalin.ca/notes/theoretical_results.pdf
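    //
    // As a worked example (added here for illustration): the intra DC bias
    // of 109/256 below corresponds to a threshold of 1 - 109/256 = 147/256,
    // about 0.574. Inverting the formula above, that threshold implies an
    // average rate difference of (0.574 - 0.5) * 2.0 / (ln(2) / 6.0), or
    // roughly 1.28 bits, in favor of the smaller magnitude.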
    self.dc_offset =
      self.dc_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset0 =
      self.ac_quant.get() as u32 * (if is_intra { 98 } else { 97 }) / 256;
    self.ac_offset1 =
      self.ac_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset_eob =
      self.ac_quant.get() as u32 * (if is_intra { 88 } else { 44 }) / 256;
  }

  #[inline]
  pub fn quantize<T: Coefficient>(
    &self, coeffs: &[T], qcoeffs: &mut [T], tx_size: TxSize, tx_type: TxType,
  ) -> u16 {
    let scan = av1_scan_orders[tx_size as usize][tx_type as usize].scan;
    let iscan = av1_scan_orders[tx_size as usize][tx_type as usize].iscan;

    qcoeffs[0] = {
      let coeff: i32 = i32::cast_from(coeffs[0]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();
      T::cast_from(copysign(
        divu_pair(abs_coeff + self.dc_offset, self.dc_mul_add),
        coeff,
      ))
    };

    // Find the last non-zero coefficient using our smaller biases and
    // zero everything else.
    // This threshold is such that `abs(coeff) < deadzone` implies:
    // (abs(coeff << log_tx_scale) + ac_offset_eob) / ac_quant == 0
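    // (Added note: `align_power_of_two_and_shift(x, n)` computes
    // `ceil(x / 2^n)`, so `abs(coeff) < deadzone` gives
    // `abs(coeff) << log_tx_scale < ac_quant - ac_offset_eob`, which makes
    // the quotient above zero.)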
    let deadzone = T::cast_from(
      (self.ac_quant.get() as usize - self.ac_offset_eob as usize)
        .align_power_of_two_and_shift(self.log_tx_scale),
    );
    let eob = {
      let eob_minus_one = iscan
        .iter()
        .zip(coeffs)
        .map(|(&i, &c)| if c.abs() >= deadzone { i } else { 0 })
        .max()
        .unwrap_or(0);
      // We skip the DC coefficient since it has its own quantizer index.
      if eob_minus_one > 0 {
        eob_minus_one + 1
      } else {
        u16::from(qcoeffs[0] != T::cast_from(0))
      }
    };

    // Here we use different rounding biases depending on whether we've
    // had recent coefficients that are larger than one, or less than
    // one. The reason for this is that a block usually has a chunk of
    // large coefficients and a tail of zeroes and ones, and the tradeoffs
    // for coding these two are different. In the tail of zeroes and ones,
    // you'll likely end up spending most bits just saying where that
    // coefficient is in the block, whereas in the chunk of larger
    // coefficients, most bits will be spent on coding its magnitude.
    // To that end, we want to bias more toward rounding to zero for
    // that tail of zeroes and ones than we do for the larger coefficients.
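    //
    // Illustrative numbers (added, not from the original comment): with
    // ac_quant = 100 and intra biases, ac_offset1 = 100 * 109 / 256 = 42
    // and ac_offset0 = 100 * 98 / 256 = 38. A residual of 158 has
    // level0 = 1 and rounds up to 2 only when 158 + offset >= 200, i.e.
    // with the larger bias used inside a run of big coefficients, but not
    // with the smaller bias used in the tail.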
    let mut level_mode = 1;
    let ac_quant = self.ac_quant.get() as u32;
    for &pos in scan.iter().take(usize::from(eob)).skip(1) {
      let coeff = i32::cast_from(coeffs[pos as usize]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();

      let level0 = divu_pair(abs_coeff, self.ac_mul_add);
      let offset = if level0 > 1 - level_mode {
        self.ac_offset1
      } else {
        self.ac_offset0
      };

      let abs_qcoeff: u32 =
        level0 + (abs_coeff + offset >= (level0 + 1) * ac_quant) as u32;
      if level_mode != 0 && abs_qcoeff == 0 {
        level_mode = 0;
      } else if abs_qcoeff > 1 {
        level_mode = 1;
      }

      qcoeffs[pos as usize] = T::cast_from(copysign(abs_qcoeff, coeff));
    }

    // Rather than zeroing the tail in scan order, assume that qcoeffs is
    // pre-filled with zeros.

    // Check the eob is correct
    debug_assert_eq!(
      usize::from(eob),
      scan
        .iter()
        .rposition(|&i| qcoeffs[i as usize] != T::cast_from(0))
        .map(|n| n + 1)
        .unwrap_or(0)
    );

    eob
  }
}

pub mod rust {
  use super::*;
  use crate::cpu_features::CpuFeatureLevel;
  use std::mem::MaybeUninit;

  pub fn dequantize<T: Coefficient>(
    qindex: u8, coeffs: &[T], _eob: u16, rcoeffs: &mut [MaybeUninit<T>],
    tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
    _cpu: CpuFeatureLevel,
  ) {
    let log_tx_scale = get_log_tx_scale(tx_size) as i32;
    let offset = (1 << log_tx_scale) - 1;

    let dc_quant = dc_q(qindex, dc_delta_q, bit_depth).get() as i32;
    let ac_quant = ac_q(qindex, ac_delta_q, bit_depth).get() as i32;

    for (i, (r, c)) in rcoeffs
      .iter_mut()
      .zip(coeffs.iter().map(|&c| i32::cast_from(c)))
      .enumerate()
    {
      let quant = if i == 0 { dc_quant } else { ac_quant };
      // `(c >> 31) & offset` adds the rounding offset only for negative
      // coefficients, so the arithmetic shift below rounds toward zero.
      r.write(T::cast_from(
        (c * quant + ((c >> 31) & offset)) >> log_tx_scale,
      ));
    }
  }
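
  // A small illustrative test (an addition, not part of the upstream
  // source): dequantizing +/-1 reproduces the DC and AC quantizers with
  // the correct sign for a 4x4 transform, where no extra scaling applies.
  #[cfg(test)]
  #[test]
  fn dequantize_reproduces_quantizer() {
    let qindex = 100u8;
    let coeffs = [1i32, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    let mut rcoeffs = [MaybeUninit::<i32>::uninit(); 16];
    dequantize(
      qindex,
      &coeffs,
      16,
      &mut rcoeffs,
      TxSize::TX_4X4,
      8,
      0,
      0,
      CpuFeatureLevel::default(),
    );
    // SAFETY: `dequantize` wrote all 16 coefficients above.
    let r = unsafe { std::mem::transmute::<_, [i32; 16]>(rcoeffs) };
    assert_eq!(r[0], dc_q(qindex, 0, 8).get() as i32);
    assert_eq!(r[4], -(ac_q(qindex, 0, 8).get() as i32));
  }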
}