/src/aom/av1/encoder/nonrd_opt.c
/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "av1/common/reconinter.h"

#include "av1/encoder/encodemv.h"
#include "av1/encoder/nonrd_opt.h"
#include "av1/encoder/rdopt.h"

static const SCAN_ORDER av1_fast_idtx_scan_order_16x16 = {
  av1_fast_idtx_scan_16x16, av1_fast_idtx_iscan_16x16
};

#define DECLARE_BLOCK_YRD_BUFFERS()                      \
  DECLARE_ALIGNED(64, tran_low_t, dqcoeff_buf[16 * 16]); \
  DECLARE_ALIGNED(64, tran_low_t, qcoeff_buf[16 * 16]);  \
  DECLARE_ALIGNED(64, tran_low_t, coeff_buf[16 * 16]);   \
  uint16_t eob[1];

#define DECLARE_BLOCK_YRD_VARS()                                          \
  /* When is_tx_8x8_dual_applicable is true, we compute the txfm for the  \
   * entire bsize and write macroblock_plane::coeff. So low_coeff is kept \
   * as a non-const so we can reassign it to macroblock_plane::coeff. */  \
  int16_t *low_coeff = (int16_t *)coeff_buf;                              \
  int16_t *const low_qcoeff = (int16_t *)qcoeff_buf;                      \
  int16_t *const low_dqcoeff = (int16_t *)dqcoeff_buf;                    \
  const int diff_stride = bw;

#define DECLARE_LOOP_VARS_BLOCK_YRD() \
  const int16_t *src_diff = &p->src_diff[(r * diff_stride + c) << 2];

static AOM_FORCE_INLINE void update_yrd_loop_vars(
    MACROBLOCK *x, int *skippable, int step, int ncoeffs,
    int16_t *const low_coeff, int16_t *const low_qcoeff,
    int16_t *const low_dqcoeff, RD_STATS *this_rdc, int *eob_cost,
    int tx_blk_id) {
  const int is_txfm_skip = (ncoeffs == 0);
  *skippable &= is_txfm_skip;
  x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
  *eob_cost += get_msb(ncoeffs + 1);
  if (ncoeffs == 1)
    this_rdc->rate += (int)abs(low_qcoeff[0]);
  else if (ncoeffs > 1)
    this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);

  this_rdc->dist += av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
}
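
/* A sketch of the bookkeeping above, for illustration (n is shorthand for
 * step << 4, the number of coefficients). The function accumulates a cheap
 * RD proxy rather than true bit counts:
 *
 *   rate += aom_satd_lp(low_qcoeff, n);   // SATD of quantized coefficients
 *   eob_cost += get_msb(ncoeffs + 1);     // ~log2 cost of the eob value
 *   dist += av1_block_error_lp(low_coeff, low_dqcoeff, n) >> 2;
 *
 * The fixed >> 2 keeps the transform-domain error on the same scale as the
 * rest of the RD cost (compare the sse shifts at the end of av1_block_yrd).
 */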

static inline void aom_process_hadamard_lp_8x16(MACROBLOCK *x,
                                                int max_blocks_high,
                                                int max_blocks_wide,
                                                int num_4x4_w, int step,
                                                int block_step) {
  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
  const int bw = 4 * num_4x4_w;
  const int num_4x4 = AOMMIN(num_4x4_w, max_blocks_wide);
  int block = 0;

  for (int r = 0; r < max_blocks_high; r += block_step) {
    for (int c = 0; c < num_4x4; c += 2 * block_step) {
      const int16_t *src_diff = &p->src_diff[(r * bw + c) << 2];
      int16_t *low_coeff = (int16_t *)p->coeff + BLOCK_OFFSET(block);
      aom_hadamard_lp_8x8_dual(src_diff, (ptrdiff_t)bw, low_coeff);
      block += 2 * step;
    }
  }
}
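
/* For illustration, assuming the dual primitive packs its two results
 * back-to-back (64 coefficients per 8x8 block), one call above would be
 * equivalent to
 *
 *   aom_hadamard_lp_8x8(src_diff, bw, low_coeff);
 *   aom_hadamard_lp_8x8(src_diff + 8, bw, low_coeff + 64);
 *
 * which is why the column loop advances by 2 * block_step and the block
 * index by 2 * step.
 */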

#if CONFIG_AV1_HIGHBITDEPTH
#define DECLARE_BLOCK_YRD_HBD_VARS()     \
  tran_low_t *const coeff = coeff_buf;   \
  tran_low_t *const qcoeff = qcoeff_buf; \
  tran_low_t *const dqcoeff = dqcoeff_buf;

static AOM_FORCE_INLINE void update_yrd_loop_vars_hbd(
    MACROBLOCK *x, int *skippable, int step, int ncoeffs,
    tran_low_t *const coeff, tran_low_t *const qcoeff,
    tran_low_t *const dqcoeff, RD_STATS *this_rdc, int *eob_cost,
    int tx_blk_id) {
  const MACROBLOCKD *xd = &x->e_mbd;
  const int is_txfm_skip = (ncoeffs == 0);
  *skippable &= is_txfm_skip;
  x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
  *eob_cost += get_msb(ncoeffs + 1);

  int64_t dummy;
  if (ncoeffs == 1)
    this_rdc->rate += (int)abs(qcoeff[0]);
  else if (ncoeffs > 1)
    this_rdc->rate += aom_satd(qcoeff, step << 4);
  this_rdc->dist +=
      av1_highbd_block_error(coeff, dqcoeff, step << 4, &dummy, xd->bd) >> 2;
}
#endif

/*!\brief Calculates RD Cost using Hadamard transform.
 *
 * \ingroup nonrd_mode_search
 * \callgraph
 * \callergraph
 * Calculates RD cost using the Hadamard transform. For low bit depth, this
 * function uses the low-precision (16-bit) set of functions; for high bit
 * depth it uses the 32-bit set.
 * \param[in]    x              Pointer to structure holding all the data for
                                the current macroblock
 * \param[in]    this_rdc       Pointer to calculated RD Cost
 * \param[in]    skippable      Pointer to a flag indicating possible tx skip
 * \param[in]    bsize          Current block size
 * \param[in]    tx_size        Transform size
 *
 * \remark Nothing is returned. Instead, the calculated RD cost is placed in
 * \c this_rdc. The \c skippable flag is set if there are no non-zero
 * quantized coefficients for the Hadamard transform.
 */
void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
                   BLOCK_SIZE bsize, TX_SIZE tx_size) {
  MACROBLOCKD *xd = &x->e_mbd;
  const struct macroblockd_plane *pd = &xd->plane[AOM_PLANE_Y];
  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
  assert(bsize < BLOCK_SIZES_ALL);
  const int num_4x4_w = mi_size_wide[bsize];
  const int num_4x4_h = mi_size_high[bsize];
  const int step = 1 << (tx_size << 1);
  const int block_step = (1 << tx_size);
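  // (Assumed reading, for illustration: tx_size here is log2(width / 4), so
  // block_step is the transform width in 4x4 units and step is the number of
  // 4x4 sub-blocks covered by one transform block: 1, 4, or 16.)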
  const int row_step = step * num_4x4_w >> tx_size;
  int block = 0;
  const int max_blocks_wide =
      num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
  const int max_blocks_high =
      num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
  int eob_cost = 0;
  const int bw = 4 * num_4x4_w;
  const int bh = 4 * num_4x4_h;
  const int use_hbd = is_cur_buf_hbd(xd);
  int num_blk_skip_w = num_4x4_w;

#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd) {
    aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
                              p->src.stride, pd->dst.buf, pd->dst.stride);
  } else {
    aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                       pd->dst.buf, pd->dst.stride);
  }
#else
  aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                     pd->dst.buf, pd->dst.stride);
#endif

  // Keep the intermediate value on the stack here. Writing directly to
  // skippable causes a speed regression due to load-and-store issues in
  // update_yrd_loop_vars.
  int temp_skippable = 1;
  this_rdc->dist = 0;
  this_rdc->rate = 0;
  // For block sizes 8x16 or above, the Hadamard txfm of two adjacent 8x8
  // blocks can be done per function call. Hence the Hadamard txfm call is
  // abstracted here for those cases.
  int is_tx_8x8_dual_applicable =
      (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 &&
       block_size_high[bsize] >= 8);

#if CONFIG_AV1_HIGHBITDEPTH
  // As of now, the dual implementation of the Hadamard txfm is available only
  // for low bitdepth.
  if (use_hbd) is_tx_8x8_dual_applicable = 0;
#endif

  if (is_tx_8x8_dual_applicable) {
    aom_process_hadamard_lp_8x16(x, max_blocks_high, max_blocks_wide, num_4x4_w,
                                 step, block_step);
  }

  const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
  DECLARE_BLOCK_YRD_BUFFERS()
  DECLARE_BLOCK_YRD_VARS()
#if CONFIG_AV1_HIGHBITDEPTH
  DECLARE_BLOCK_YRD_HBD_VARS()
#else
  (void)use_hbd;
#endif

  // Keep track of the row and column of the blocks we use so that we know
  // if we are in the unrestricted motion border.
  for (int r = 0; r < max_blocks_high; r += block_step) {
    for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
      DECLARE_LOOP_VARS_BLOCK_YRD()

      switch (tx_size) {
#if CONFIG_AV1_HIGHBITDEPTH
        case TX_16X16:
          if (use_hbd) {
            aom_hadamard_16x16(src_diff, diff_stride, coeff);
            av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
                            dqcoeff, p->dequant_QTX, eob,
                            // default_scan_fp_16x16_transpose and
                            // av1_default_iscan_fp_16x16_transpose have to be
                            // used together.
                            default_scan_fp_16x16_transpose,
                            av1_default_iscan_fp_16x16_transpose);
          } else {
            aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
            av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
                            p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
                            p->dequant_QTX, eob,
                            // default_scan_lp_16x16_transpose and
                            // av1_default_iscan_lp_16x16_transpose have to be
                            // used together.
                            default_scan_lp_16x16_transpose,
                            av1_default_iscan_lp_16x16_transpose);
          }
          break;
        case TX_8X8:
          if (use_hbd) {
            aom_hadamard_8x8(src_diff, diff_stride, coeff);
            av1_quantize_fp(
                coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
                p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
                default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
          } else {
            if (is_tx_8x8_dual_applicable) {
              // The coeffs are pre-computed for the whole block, so re-assign
              // low_coeff to the appropriate location.
              const int block_offset = BLOCK_OFFSET(block + s);
              low_coeff = (int16_t *)p->coeff + block_offset;
            } else {
              aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
            }
            av1_quantize_lp(
                low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff,
                low_dqcoeff, p->dequant_QTX, eob,
                // default_scan_8x8_transpose and
                // av1_default_iscan_8x8_transpose have to be used together.
                default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
          }
          break;
        default:
          assert(tx_size == TX_4X4);
          // In the tx_size=4x4 case, aom_fdct4x4 and aom_fdct4x4_lp generate
          // normal coefficient order, so we don't need to change the scan
          // order here.
          if (use_hbd) {
            aom_fdct4x4(src_diff, coeff, diff_stride);
            av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
                            scan_order->iscan);
          } else {
            aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
            av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
                            low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
                            scan_order->scan, scan_order->iscan);
          }
          break;
#else
        case TX_16X16:
          aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
          av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX,
                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
                          default_scan_lp_16x16_transpose,
                          av1_default_iscan_lp_16x16_transpose);
          break;
        case TX_8X8:
          if (is_tx_8x8_dual_applicable) {
            // The coeffs are pre-computed for the whole block, so re-assign
            // low_coeff to the appropriate location.
            const int block_offset = BLOCK_OFFSET(block + s);
            low_coeff = (int16_t *)p->coeff + block_offset;
          } else {
            aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
          }
          av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
                          default_scan_8x8_transpose,
                          av1_default_iscan_8x8_transpose);
          break;
        default:
          aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
          av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
                          scan_order->scan, scan_order->iscan);
          break;
#endif
      }
      assert(*eob <= 1024);
#if CONFIG_AV1_HIGHBITDEPTH
      if (use_hbd)
        update_yrd_loop_vars_hbd(x, &temp_skippable, step, *eob, coeff, qcoeff,
                                 dqcoeff, this_rdc, &eob_cost,
                                 r * num_blk_skip_w + c);
      else
#endif
        update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
                             low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
                             r * num_blk_skip_w + c);
    }
    block += row_step;
  }

  this_rdc->skip_txfm = *skippable = temp_skippable;
  if (this_rdc->sse < INT64_MAX) {
    this_rdc->sse = (this_rdc->sse << 6) >> 2;
    if (temp_skippable) {
      this_rdc->dist = this_rdc->sse;
      return;
    }
  }

  // If skippable is set, rate gets clobbered later.
  this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
  this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
}
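
/* Usage sketch (illustrative only; the locals are hypothetical). Once the
 * plane-0 predictor has been written to xd->plane[0].dst, the RD proxy for a
 * block can be obtained as:
 *
 *   RD_STATS rd;
 *   int skippable = 0;
 *   av1_invalid_rd_stats(&rd);
 *   av1_block_yrd(x, &rd, &skippable, bsize, AOMMIN(tx_size, TX_16X16));
 *   const int64_t rd_cost = RDCOST(x->rdmult, rd.rate, rd.dist);
 *
 * The TX_16X16 clamp mirrors how av1_estimate_block_intra below calls it.
 */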

// Explicitly enumerate the cases so the compiler can generate SIMD for the
// function. According to the disassembler, gcc generates SSE code for each of
// the possible block sizes. The hottest case is tx_width 16, which takes up
// about 8% of the self cycles of av1_nonrd_pick_inter_mode_sb. Since
// av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the
// potential room for improvement from writing an AVX2 optimization is only
// 3% * 8% = 0.24% of total encoding time.
static inline void scale_square_buf_vals(int16_t *dst, int tx_width,
                                         const int16_t *src, int src_stride) {
#define DO_SCALING                                                   \
  do {                                                               \
    for (int idy = 0; idy < tx_width; ++idy) {                       \
      for (int idx = 0; idx < tx_width; ++idx) {                     \
        dst[idy * tx_width + idx] = src[idy * src_stride + idx] * 8; \
      }                                                              \
    }                                                                \
  } while (0)

  if (tx_width == 4) {
    DO_SCALING;
  } else if (tx_width == 8) {
    DO_SCALING;
  } else if (tx_width == 16) {
    DO_SCALING;
  } else {
    assert(0);
  }

#undef DO_SCALING
}
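
/* The * 8 in DO_SCALING appears to give the identity "transform" the same
 * fixed-point gain as the forward transforms used elsewhere in this file
 * (see the note in av1_model_rd_for_sb_uv that transform coefficients are 8
 * times those of an orthogonal transform), so the same quantizer tables can
 * be reused unchanged. This reading is an editorial inference, not an
 * upstream-documented guarantee.
 */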

/*!\brief Calculates RD Cost when the block uses the identity transform.
 * Note that this function is only for low bit depth encoding, since it is
 * currently called only in real-time mode, which builds with high bit depth
 * disabled: -DCONFIG_AV1_HIGHBITDEPTH=0.
 *
 * \ingroup nonrd_mode_search
 * \callgraph
 * \callergraph
 * Calculates RD cost using the low-precision (16-bit) set of functions.
 * \param[in]    x              Pointer to structure holding all the data for
                                the current macroblock
 * \param[in]    pred_buf       Pointer to the prediction buffer
 * \param[in]    pred_stride    Stride for the prediction buffer
 * \param[in]    this_rdc       Pointer to calculated RD Cost
 * \param[in]    skippable      Pointer to a flag indicating possible tx skip
 * \param[in]    bsize          Current block size
 * \param[in]    tx_size        Transform size
 *
 * \remark Nothing is returned. Instead, the calculated RD cost is placed in
 * \c this_rdc. The \c skippable flag is set if all coefficients are zero.
 */
void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf,
                        int pred_stride, RD_STATS *this_rdc, int *skippable,
                        BLOCK_SIZE bsize, TX_SIZE tx_size) {
  MACROBLOCKD *xd = &x->e_mbd;
  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
  assert(bsize < BLOCK_SIZES_ALL);
  const int num_4x4_w = mi_size_wide[bsize];
  const int num_4x4_h = mi_size_high[bsize];
  const int step = 1 << (tx_size << 1);
  const int block_step = (1 << tx_size);
  const int max_blocks_wide =
      num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
  const int max_blocks_high =
      num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
  int eob_cost = 0;
  const int bw = 4 * num_4x4_w;
  const int bh = 4 * num_4x4_h;
  const int num_blk_skip_w = num_4x4_w;
  // Keep the intermediate value on the stack here. Writing directly to
  // skippable causes a speed regression due to load-and-store issues in
  // update_yrd_loop_vars.
  int temp_skippable = 1;
  int tx_wd = 0;
  const SCAN_ORDER *scan_order = NULL;
  switch (tx_size) {
    case TX_64X64:
      assert(0);  // Not implemented
      break;
    case TX_32X32:
      assert(0);  // Not used
      break;
    case TX_16X16:
      scan_order = &av1_fast_idtx_scan_order_16x16;
      tx_wd = 16;
      break;
    case TX_8X8:
      scan_order = &av1_fast_idtx_scan_order_8x8;
      tx_wd = 8;
      break;
    default:
      assert(tx_size == TX_4X4);
      scan_order = &av1_fast_idtx_scan_order_4x4;
      tx_wd = 4;
      break;
  }
  assert(scan_order != NULL);

  this_rdc->dist = 0;
  this_rdc->rate = 0;
  aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                     pred_buf, pred_stride);
  // Keep track of the row and column of the blocks we use so that we know
  // if we are in the unrestricted motion border.
  DECLARE_BLOCK_YRD_BUFFERS()
  DECLARE_BLOCK_YRD_VARS()
  for (int r = 0; r < max_blocks_high; r += block_step) {
    for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
      DECLARE_LOOP_VARS_BLOCK_YRD()
      scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride);
      av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX,
                      p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX,
                      eob, scan_order->scan, scan_order->iscan);
      assert(*eob <= 1024);
      update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
                           low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
                           r * num_blk_skip_w + c);
    }
  }
  this_rdc->skip_txfm = *skippable = temp_skippable;
  if (this_rdc->sse < INT64_MAX) {
    this_rdc->sse = (this_rdc->sse << 6) >> 2;
    if (temp_skippable) {
      this_rdc->dist = this_rdc->sse;
      return;
    }
  }
  // If skippable is set, rate gets clobbered later.
  this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
  this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
}

int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
                               MACROBLOCK *x, MACROBLOCKD *xd,
                               RD_STATS *this_rdc, int start_plane,
                               int stop_plane) {
  // Note that our transform coefficients are 8 times those of an orthogonal
  // transform, so the quantizer step is also 8 times larger. To get the
  // effective quantizer, we need to divide by 8 before sending it to the
  // modeling function.
  unsigned int sse;
  int rate;
  int64_t dist;
  int plane;
  int64_t tot_sse = 0;

  this_rdc->rate = 0;
  this_rdc->dist = 0;
  this_rdc->skip_txfm = 0;

  for (plane = start_plane; plane <= stop_plane; ++plane) {
    struct macroblock_plane *const p = &x->plane[plane];
    struct macroblockd_plane *const pd = &xd->plane[plane];
    const uint32_t dc_quant = p->dequant_QTX[0];
    const uint32_t ac_quant = p->dequant_QTX[1];
    const BLOCK_SIZE bs = plane_bsize;
    unsigned int var;
    if (!x->color_sensitivity[COLOR_SENS_IDX(plane)]) continue;

    var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
                                  pd->dst.stride, &sse);
    assert(sse >= var);
    tot_sse += sse;

    av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
                                 dc_quant >> 3, &rate, &dist);

    this_rdc->rate += rate >> 1;
    this_rdc->dist += dist << 3;

    av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3,
                                 &rate, &dist);

    this_rdc->rate += rate;
    this_rdc->dist += dist << 4;
  }

  if (this_rdc->rate == 0) {
    this_rdc->skip_txfm = 1;
  }

  if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >=
      RDCOST(x->rdmult, 0, tot_sse << 4)) {
    this_rdc->rate = 0;
    this_rdc->dist = tot_sse << 4;
    this_rdc->skip_txfm = 1;
  }

  return tot_sse;
}
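
/* Worked example of the scaling above (the values are hypothetical): if
 * p->dequant_QTX[0] is 64 in the encoder's 8x-scaled fixed point, the
 * effective DC quantizer passed to av1_model_rd_from_var_lapndz is
 * 64 >> 3 = 8, i.e. roughly the step size an orthonormal transform would
 * see.
 */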

static void compute_intra_yprediction(const AV1_COMMON *cm,
                                      PREDICTION_MODE mode, BLOCK_SIZE bsize,
                                      MACROBLOCK *x, MACROBLOCKD *xd) {
  const SequenceHeader *seq_params = cm->seq_params;
  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
  uint8_t *const src_buf_base = p->src.buf;
  uint8_t *const dst_buf_base = pd->dst.buf;
  const int src_stride = p->src.stride;
  const int dst_stride = pd->dst.stride;
  int plane = 0;
  int row, col;
  // Block and transform sizes, in number of 4x4 blocks log 2 ("*_b"):
  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
  // Transform size varies per plane, so look it up in a common way.
  const TX_SIZE tx_size = max_txsize_lookup[bsize];
  const BLOCK_SIZE plane_bsize =
      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
  // If mb_to_right_edge is < 0, the current block size extends into the UMV,
  // and we won't visit the sub blocks that are wholly within the UMV.
  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
  // Keep track of the row and column of the blocks we use so that we know
  // if we are in the unrestricted motion border.
  for (row = 0; row < max_blocks_high; row += (1 << tx_size)) {
    // Skip visiting the sub blocks that are wholly within the UMV.
    for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
      p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
      pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
      av1_predict_intra_block(
          xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
          block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0,
          FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride,
          0, 0, plane);
    }
  }
  p->src.buf = src_buf_base;
  pd->dst.buf = dst_buf_base;
}

// Checks whether the intra mode needs to be pruned based on the
// 'intra_y_mode_bsize_mask_nrd' and 'prune_hv_pred_modes_using_src_sad'
// speed features.
static inline bool is_prune_intra_mode(
    AV1_COMP *cpi, int mode_index, int force_intra_check, BLOCK_SIZE bsize,
    uint8_t segment_id, SOURCE_SAD source_sad_nonrd,
    uint8_t color_sensitivity[MAX_MB_PLANE - 1]) {
  const PREDICTION_MODE this_mode = intra_mode_list[mode_index];
  if (mode_index > 2 || force_intra_check == 0) {
    if (!((1 << this_mode) & cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize]))
      return true;

    if (this_mode == DC_PRED) return false;

    if (!cpi->sf.rt_sf.prune_hv_pred_modes_using_src_sad) return false;

    const bool has_color_sensitivity =
        color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] &&
        color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)];
    if (has_color_sensitivity &&
        (cpi->rc.frame_source_sad > 1.1 * cpi->rc.avg_source_sad ||
         cyclic_refresh_segment_id_boosted(segment_id) ||
         source_sad_nonrd > kMedSad))
      return false;

    return true;
  }
  return false;
}

/*!\brief Estimation of RD cost of an intra mode for the non-RD optimized
 * case.
 *
 * \ingroup nonrd_mode_search
 * \callgraph
 * \callergraph
 * Calculates RD cost for an intra mode for a single TX block using the
 * Hadamard transform.
 * \param[in]    plane          Color plane
 * \param[in]    block          Index of a TX block in a prediction block
 * \param[in]    row            Row of a current TX block
 * \param[in]    col            Column of a current TX block
 * \param[in]    plane_bsize    Block size of a current prediction block
 * \param[in]    tx_size        Transform size
 * \param[in]    arg            Pointer to a structure that holds parameters
 *                              for intra mode search
 *
 * \remark Nothing is returned. Instead, the best mode and the RD cost of the
 * best mode are set in \c args->rdc and \c args->mode.
 */
void av1_estimate_block_intra(int plane, int block, int row, int col,
                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                              void *arg) {
  struct estimate_block_intra_args *const args = arg;
  AV1_COMP *const cpi = args->cpi;
  AV1_COMMON *const cm = &cpi->common;
  MACROBLOCK *const x = args->x;
  MACROBLOCKD *const xd = &x->e_mbd;
  struct macroblock_plane *const p = &x->plane[plane];
  struct macroblockd_plane *const pd = &xd->plane[plane];
  const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
  uint8_t *const src_buf_base = p->src.buf;
  uint8_t *const dst_buf_base = pd->dst.buf;
  const int64_t src_stride = p->src.stride;
  const int64_t dst_stride = pd->dst.stride;

  (void)block;

  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);

  if (args->prune_mode_based_on_sad || args->prune_palette_sad) {
    unsigned int this_sad = cpi->ppi->fn_ptr[plane_bsize].sdf(
        p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride);
    const unsigned int sad_threshold =
        args->best_sad != UINT_MAX ? args->best_sad + (args->best_sad >> 4)
                                   : UINT_MAX;
    // Skip the evaluation of the current mode if its SAD is more than the
    // threshold.
    if (args->prune_mode_based_on_sad && this_sad > sad_threshold) {
      // For the current mode, set rate and distortion to the maximum possible
      // values and return.
      // Note: args->rdc->rate is checked in av1_nonrd_pick_intra_mode() to
      // skip the evaluation of the current mode.
      args->rdc->rate = INT_MAX;
      args->rdc->dist = INT64_MAX;
      return;
    }
    if (this_sad < args->best_sad) {
      args->best_sad = this_sad;
    }
  }

  RD_STATS this_rdc;
  av1_invalid_rd_stats(&this_rdc);

  p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
  pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];

  if (plane == 0) {
    av1_block_yrd(x, &this_rdc, &args->skippable, bsize_tx,
                  AOMMIN(tx_size, TX_16X16));
  } else {
    av1_model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, plane, plane);
  }

  p->src.buf = src_buf_base;
  pd->dst.buf = dst_buf_base;
  assert(args->rdc->rate != INT_MAX && args->rdc->dist != INT64_MAX);
  args->rdc->rate += this_rdc.rate;
  args->rdc->dist += this_rdc.dist;
}
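
/* Usage sketch (illustrative): av1_estimate_block_intra is written as a
 * per-TX-block visitor for av1_foreach_transformed_block_in_plane. A caller
 * accumulates chroma RD cost roughly as av1_estimate_intra_mode below does:
 *
 *   struct estimate_block_intra_args args;
 *   init_estimate_block_intra_args(&args, cpi, x);
 *   args.rdc = &this_rdc;
 *   av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_U,
 *                                          av1_estimate_block_intra, &args);
 */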

/*!\brief Estimates the best intra mode for inter mode search.
 *
 * \ingroup nonrd_mode_search
 * \callgraph
 * \callergraph
 *
 * Using heuristics based on the best inter mode, block size, and other
 * criteria, decides whether to check intra modes. If so, estimates and
 * selects the best intra mode from a reduced set of intra modes (at most 4
 * intra modes are checked).
 *
 * \param[in]    cpi                      Top-level encoder structure
 * \param[in]    x                        Pointer to structure holding all the
 *                                        data for the current macroblock
 * \param[in]    bsize                    Current block size
 * \param[in]    best_early_term          Flag, indicating that TX for the
 *                                        best inter mode was skipped
 * \param[in]    ref_cost_intra           Cost of signalling intra mode
 * \param[in]    reuse_prediction         Flag, indicating prediction re-use
 * \param[in]    orig_dst                 Original destination buffer
 * \param[in]    tmp_buffers              Pointer to temporary buffers for
 *                                        prediction re-use
 * \param[out]   this_mode_pred           Pointer to store prediction buffer
 *                                        for prediction re-use
 * \param[in]    best_rdc                 Pointer to RD cost for the best
 *                                        selected intra mode
 * \param[in]    best_pickmode            Pointer to a structure containing
 *                                        best mode picked so far
 * \param[in]    ctx                      Pointer to structure holding coding
 *                                        contexts and modes for the block
 *
 * \remark Nothing is returned. Instead, the calculated RD cost is placed in
 * \c best_rdc and the best selected mode is placed in \c best_pickmode.
 *
 */
void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                             int best_early_term, unsigned int ref_cost_intra,
                             int reuse_prediction, struct buf_2d *orig_dst,
                             PRED_BUFFER *tmp_buffers,
                             PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
                             BEST_PICKMODE *best_pickmode,
                             PICK_MODE_CONTEXT *ctx,
                             unsigned int *best_sad_norm) {
  AV1_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mi = xd->mi[0];
  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
  const unsigned char segment_id = mi->segment_id;
  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
  const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
  const bool is_screen_content =
      cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
  const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;

  const CommonQuantParams *quant_params = &cm->quant_params;

  RD_STATS this_rdc;

  int intra_cost_penalty = av1_get_intra_cost_penalty(
      quant_params->base_qindex, quant_params->y_dc_delta_q,
      cm->seq_params->bit_depth);
  int64_t inter_mode_thresh =
      RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
  int perform_intra_pred = rt_sf->check_intra_pred_nonrd;
  int force_intra_check = 0;
  // For a spatial enhancement layer: turn off intra prediction if the
  // previous spatial layer (as golden ref) is not chosen as the best
  // reference. Only do this for a temporal enhancement layer and on non-key
  // frames.
  if (cpi->svc.spatial_layer_id > 0 &&
      best_pickmode->best_ref_frame != GOLDEN_FRAME &&
      cpi->svc.temporal_layer_id > 0 &&
      !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)
    perform_intra_pred = 0;

  int do_early_exit_rdthresh = 1;

  uint32_t spatial_var_thresh = 50;
  int motion_thresh = 32;
  // Adjust thresholds so that intra mode is more likely to be tested if the
  // other references (golden, alt) are skipped/not checked. For now, always
  // adjust for svc mode.
  if (cpi->ppi->use_svc || (rt_sf->use_nonrd_altref_frame == 0 &&
                            rt_sf->nonrd_prune_ref_frame_search > 0)) {
    spatial_var_thresh = 150;
    motion_thresh = 0;
  }

  // Some adjustments to checking intra mode based on source variance.
  if (x->source_variance < spatial_var_thresh) {
    // If the best inter mode has large motion or a non-LAST ref, reduce the
    // intra cost penalty, so intra mode is more likely to be tested.
    if (best_rdc->rdcost != INT64_MAX &&
        (best_pickmode->best_ref_frame != LAST_FRAME ||
         abs(mi->mv[0].as_mv.row) >= motion_thresh ||
         abs(mi->mv[0].as_mv.col) >= motion_thresh)) {
      intra_cost_penalty = intra_cost_penalty >> 2;
      inter_mode_thresh =
          RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
      do_early_exit_rdthresh = 0;
    }
    if ((x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) &&
         x->content_state_sb.source_sad_nonrd >= kHighSad) ||
        (is_screen_content && x->source_variance < 50 &&
         ((bsize >= BLOCK_32X32 &&
           x->content_state_sb.source_sad_nonrd != kZeroSad) ||
          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)))
      force_intra_check = 1;
    // For big blocks it is worth checking intra (since only DC will be
    // checked), even if best_early_term is set.
    if (bsize >= BLOCK_32X32) best_early_term = 0;
  } else if (rt_sf->source_metrics_sb_nonrd &&
             x->content_state_sb.source_sad_nonrd <= kLowSad) {
    perform_intra_pred = 0;
  }

  if (best_rdc->skip_txfm && best_pickmode->best_mode_initial_skip_flag) {
    if (rt_sf->skip_intra_pred == 1 && best_pickmode->best_mode != NEWMV)
      perform_intra_pred = 0;
    else if (rt_sf->skip_intra_pred == 2)
      perform_intra_pred = 0;
  }

  if (!(best_rdc->rdcost == INT64_MAX || force_intra_check ||
        (perform_intra_pred && !best_early_term &&
         bsize <= cpi->sf.part_sf.max_intra_bsize))) {
    return;
  }

  // Early exit based on the RD cost calculated using the known rate. When
  // is_screen_content is true, more bias is given to intra modes, hence a
  // more conservative (biased) threshold is used for the early exit.
  const int64_t known_rd = is_screen_content
                               ? CALC_BIASED_RDCOST(inter_mode_thresh)
                               : inter_mode_thresh;
  if (known_rd > best_rdc->rdcost) return;

  struct estimate_block_intra_args args;
  init_estimate_block_intra_args(&args, cpi, x);
  if (prune_palette_testing_inter(cpi, x->source_variance))
    args.prune_palette_sad = true;
  TX_SIZE intra_tx_size = AOMMIN(
      AOMMIN(max_txsize_lookup[bsize],
             tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
      TX_16X16);
  if (is_screen_content && cpi->rc.high_source_sad &&
      x->source_variance > spatial_var_thresh && bsize <= BLOCK_16X16)
    intra_tx_size = TX_4X4;

  PRED_BUFFER *const best_pred = best_pickmode->best_pred;
  if (reuse_prediction && best_pred != NULL) {
    const int bh = block_size_high[bsize];
    const int bw = block_size_wide[bsize];
    if (best_pred->data == orig_dst->buf) {
      *this_mode_pred = &tmp_buffers[get_pred_buffer(tmp_buffers, 3)];
      aom_convolve_copy(best_pred->data, best_pred->stride,
                        (*this_mode_pred)->data, (*this_mode_pred)->stride, bw,
                        bh);
      best_pickmode->best_pred = *this_mode_pred;
    }
  }
  pd->dst = *orig_dst;

  for (int midx = 0; midx < RTC_INTRA_MODES; ++midx) {
    const PREDICTION_MODE this_mode = intra_mode_list[midx];
    const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
    const int64_t mode_rd_thresh = rd_threshes[mode_index];

    if (is_prune_intra_mode(cpi, midx, force_intra_check, bsize, segment_id,
                            x->content_state_sb.source_sad_nonrd,
                            x->color_sensitivity))
      continue;

    if (is_screen_content && rt_sf->source_metrics_sb_nonrd) {
      // For spatially flat blocks with zero motion, only check
      // DC mode.
      if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
          x->source_variance == 0 && this_mode != DC_PRED)
        continue;
      // Only test intra for big blocks if the spatial variance is small.
      else if (bsize > BLOCK_32X32 && x->source_variance > 50)
        continue;
    }

    if (rd_less_than_thresh(best_rdc->rdcost, mode_rd_thresh,
                            rd_thresh_freq_fact[mode_index]) &&
        (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) {
      continue;
    }
    const BLOCK_SIZE uv_bsize =
        get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
                             xd->plane[AOM_PLANE_U].subsampling_y);

    mi->mode = this_mode;
    mi->ref_frame[0] = INTRA_FRAME;
    mi->ref_frame[1] = NONE_FRAME;

    av1_invalid_rd_stats(&this_rdc);
    args.mode = this_mode;
    args.skippable = 1;
    args.rdc = &this_rdc;
    mi->tx_size = intra_tx_size;
    compute_intra_yprediction(cm, this_mode, bsize, x, xd);
    // Look into selecting tx_size here, based on the prediction residual.
    av1_block_yrd(x, &this_rdc, &args.skippable, bsize, mi->tx_size);
    // TODO(kyslov@) Need to account for skippable
    if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
      av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_U,
                                             av1_estimate_block_intra, &args);
    }
    if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
      av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_V,
                                             av1_estimate_block_intra, &args);
    }

    int mode_cost = 0;
    if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
      mode_cost +=
          x->mode_costs.angle_delta_cost[this_mode - V_PRED]
                                        [MAX_ANGLE_DELTA +
                                         mi->angle_delta[PLANE_TYPE_Y]];
    }
    if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
      mode_cost += x->mode_costs.filter_intra_cost[bsize][0];
    }
    this_rdc.rate += ref_cost_intra;
    this_rdc.rate += intra_cost_penalty;
    this_rdc.rate += mode_cost;
    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);

    if (is_screen_content && rt_sf->source_metrics_sb_nonrd) {
      // For blocks with low spatial variance and color SAD,
      // favor the intra modes, but only on scene/slide changes.
      if (cpi->rc.high_source_sad && x->source_variance < 800 &&
          (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
           x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]))
        this_rdc.rdcost = CALC_BIASED_RDCOST(this_rdc.rdcost);
      // Otherwise bias against intra for blocks with zero
      // motion and no color, on non-scene/slide changes.
      else if (!cpi->rc.high_source_sad && x->source_variance > 0 &&
               x->content_state_sb.source_sad_nonrd == kZeroSad &&
               x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
               x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
        this_rdc.rdcost = (3 * this_rdc.rdcost) >> 1;
    }

    if (this_rdc.rdcost < best_rdc->rdcost) {
      *best_rdc = this_rdc;
      best_pickmode->best_mode = this_mode;
      best_pickmode->best_tx_size = mi->tx_size;
      best_pickmode->best_ref_frame = INTRA_FRAME;
      best_pickmode->best_second_ref_frame = NONE;
      best_pickmode->best_mode_skip_txfm = this_rdc.skip_txfm;
      mi->uv_mode = this_mode;
      mi->mv[0].as_int = INVALID_MV;
      mi->mv[1].as_int = INVALID_MV;
      if (!this_rdc.skip_txfm)
        memset(ctx->blk_skip, 0,
               sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
    }
  }
  if (best_pickmode->best_ref_frame == INTRA_FRAME)
    memset(ctx->blk_skip, 0,
           sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
  mi->tx_size = best_pickmode->best_tx_size;

  *best_sad_norm = args.best_sad >>
                   (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
}