/src/libvpx/vp9/encoder/vp9_encodemb.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <stdlib.h> |
12 | | |
13 | | #include "./vp9_rtcd.h" |
14 | | #include "./vpx_config.h" |
15 | | #include "./vpx_dsp_rtcd.h" |
16 | | |
17 | | #include "vpx_dsp/quantize.h" |
18 | | #include "vpx_mem/vpx_mem.h" |
19 | | #include "vpx_ports/mem.h" |
20 | | |
21 | | #if CONFIG_MISMATCH_DEBUG |
22 | | #include "vpx_util/vpx_debug_util.h" |
23 | | #endif |
24 | | |
25 | | #include "vp9/common/vp9_idct.h" |
26 | | #include "vp9/common/vp9_reconinter.h" |
27 | | #include "vp9/common/vp9_reconintra.h" |
28 | | #include "vp9/common/vp9_scan.h" |
29 | | |
30 | | #include "vp9/encoder/vp9_encodemb.h" |
31 | | #include "vp9/encoder/vp9_encoder.h" |
32 | | #include "vp9/encoder/vp9_rd.h" |
33 | | #include "vp9/encoder/vp9_tokenize.h" |
34 | | |
35 | | #if defined(NDEBUG) |
36 | | #if defined(__clang__) && defined(__has_builtin) |
37 | | #if __has_builtin(__builtin_assume) |
38 | | // This is verified by test/vp9_scan_test.cc |
39 | | #define ASSUME_VALID_SCAN_VALUE(i) \ |
40 | 916M | __builtin_assume(0 <= i && i <= MAX_SCAN_VALUE) |
41 | | // This is verified by test/vp9_entropy_test.cc |
42 | | #define ASSUME_VALID_ENERGY_CLASS(i) \ |
43 | 933M | __builtin_assume(0 <= i && i <= MAX_ENERGY_CLASS) |
44 | 869M | #define ASSUME_VALID_TOKEN(i) __builtin_assume(0 <= i && i <= MAX_TOKEN) |
45 | | #else |
46 | | #define ASSUME_VALID_SCAN_VALUE(i) \ |
47 | | do { \ |
48 | | } while (0) |
49 | | #define ASSUME_VALID_ENERGY_CLASS(i) \ |
50 | | do { \ |
51 | | } while (0) |
52 | | #define ASSUME_VALID_TOKEN(i) \ |
53 | | do { \ |
54 | | } while (0) |
55 | | #endif |
56 | | #else |
57 | | #define ASSUME_VALID_SCAN_VALUE(i) \ |
58 | | do { \ |
59 | | } while (0) |
60 | | #define ASSUME_VALID_ENERGY_CLASS(i) \ |
61 | | do { \ |
62 | | } while (0) |
63 | | #define ASSUME_VALID_TOKEN(i) \ |
64 | | do { \ |
65 | | } while (0) |
66 | | #endif |
67 | | #else |
68 | | #define ASSUME_VALID_SCAN_VALUE(i) assert(0 <= i && i <= MAX_SCAN_VALUE) |
69 | | #define ASSUME_VALID_ENERGY_CLASS(i) assert(0 <= i && i <= MAX_ENERGY_CLASS) |
70 | | #define ASSUME_VALID_TOKEN(i) assert(0 <= i && i <= MAX_TOKEN) |
71 | | #endif |
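The macro ladder above gives the compiler the same bounds information in every build mode: in a release (NDEBUG) clang build with __builtin_assume available it becomes an optimizer hint, in other release builds it expands to nothing, and in debug builds it falls back to a runtime assert. A minimal sketch of that pattern, assuming a hypothetical 16-entry table and <assert.h>:

    // Hedged sketch, not part of vp9_encodemb.c.
    #include <assert.h>
    static int lookup16(const int *table, int i) {
    #if defined(NDEBUG) && defined(__clang__)
      __builtin_assume(0 <= i && i <= 15);  // never evaluated; optimizer hint only
    #else
      assert(0 <= i && i <= 15);            // runtime check in debug builds
    #endif
      return table[i];
    }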
72 | | |
73 | | struct optimize_ctx { |
74 | | ENTROPY_CONTEXT ta[MAX_MB_PLANE][16]; |
75 | | ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; |
76 | | }; |
77 | | |
78 | 30.5M | void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { |
79 | 30.5M | struct macroblock_plane *const p = &x->plane[plane]; |
80 | 30.5M | const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; |
81 | 30.5M | const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); |
82 | 30.5M | const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; |
83 | 30.5M | const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize]; |
84 | | |
85 | 30.5M | #if CONFIG_VP9_HIGHBITDEPTH |
86 | 30.5M | if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
87 | 0 | vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, |
88 | 0 | p->src.stride, pd->dst.buf, pd->dst.stride, |
89 | 0 | x->e_mbd.bd); |
90 | 0 | return; |
91 | 0 | } |
92 | 30.5M | #endif // CONFIG_VP9_HIGHBITDEPTH |
93 | 30.5M | vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, |
94 | 30.5M | pd->dst.buf, pd->dst.stride); |
95 | 30.5M | } |
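vpx_subtract_block stores the per-pixel residual (source minus prediction) that the forward transforms below consume; the call above passes (rows, cols, diff, diff_stride, src, src_stride, pred, pred_stride). A scalar reference sketch of that operation (the real libvpx kernel is SIMD-optimized; needs <stdint.h> and <stddef.h>):

    // Reference sketch only; mirrors the argument order used in the call above.
    static void subtract_block_ref(int rows, int cols, int16_t *diff,
                                   ptrdiff_t diff_stride, const uint8_t *src,
                                   ptrdiff_t src_stride, const uint8_t *pred,
                                   ptrdiff_t pred_stride) {
      for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) diff[c] = (int16_t)(src[c] - pred[c]);
        diff += diff_stride;
        src += src_stride;
        pred += pred_stride;
      }
    }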
96 | | |
97 | | static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { |
98 | | { 10, 6 }, |
99 | | { 8, 5 }, |
100 | | }; |
101 | | |
102 | | // 'num' can be negative, but 'shift' must be non-negative. |
103 | | #define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \ |
104 | 1.42M | (((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift))) |
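Right-shifting a negative value is implementation-defined in C, so the macro shifts the magnitude and restores the sign, which also truncates toward zero rather than toward negative infinity. For example:

    // Worked example: RIGHT_SHIFT_POSSIBLY_NEGATIVE(-5, 1) == -((5) >> 1) == -2,
    // whereas an arithmetic shift -5 >> 1 would typically yield -3.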
105 | | |
106 | | int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, |
107 | 23.2M | int ctx) { |
108 | 23.2M | MACROBLOCKD *const xd = &mb->e_mbd; |
109 | 23.2M | struct macroblock_plane *const p = &mb->plane[plane]; |
110 | 23.2M | struct macroblockd_plane *const pd = &xd->plane[plane]; |
111 | 23.2M | const int ref = is_inter_block(xd->mi[0]); |
112 | 23.2M | uint8_t token_cache[MAX_SCAN_VALUE + 1]; |
113 | 23.2M | const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); |
114 | 23.2M | tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); |
115 | 23.2M | tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); |
116 | 23.2M | const int eob = p->eobs[block]; |
117 | 23.2M | const PLANE_TYPE plane_type = get_plane_type(plane); |
118 | 23.2M | const int default_eob = 16 << (tx_size << 1); |
119 | 23.2M | const int shift = (tx_size == TX_32X32); |
120 | 23.2M | const int16_t *const dequant_ptr = pd->dequant; |
121 | 23.2M | const uint8_t *const band_translate = get_band_translate(tx_size); |
122 | 23.2M | const ScanOrder *const so = get_scan(xd, tx_size, plane_type, block); |
123 | 23.2M | const int16_t *const scan = so->scan; |
124 | 23.2M | const int16_t *const nb = so->neighbors; |
125 | 23.2M | const MODE_INFO *mbmi = xd->mi[0]; |
126 | 23.2M | const int sharpness = mb->sharpness; |
127 | 23.2M | const int64_t rdadj = (int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]; |
128 | 23.2M | const int64_t rdmult = |
129 | 23.2M | (sharpness == 0 ? rdadj >> 1 |
130 | 23.2M | : (rdadj * (8 - sharpness + mbmi->segment_id)) >> 4); |
131 | | |
132 | 23.2M | const int64_t rddiv = mb->rddiv; |
133 | 23.2M | int64_t rd_cost0, rd_cost1; |
134 | 23.2M | int64_t rate0, rate1; |
135 | 23.2M | int16_t t0, t1; |
136 | 23.2M | int i, final_eob; |
137 | 23.2M | int count_high_values_after_eob = 0; |
138 | 23.2M | #if CONFIG_VP9_HIGHBITDEPTH |
139 | 23.2M | const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); |
140 | | #else |
141 | | const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); |
142 | | #endif |
143 | 23.2M | unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = |
144 | 23.2M | mb->token_costs[tx_size][plane_type][ref]; |
145 | 23.2M | unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS]; |
146 | 23.2M | int64_t eob_cost0, eob_cost1; |
147 | 23.2M | int64_t accu_rate = 0; |
148 | | // Initialized to the worst possible error for the largest transform size. |
149 | | // This ensures that it never goes negative. |
150 | 23.2M | int64_t accu_error = ((int64_t)1) << 50; |
151 | 23.2M | int64_t best_block_rd_cost = INT64_MAX; |
152 | 23.2M | int x_prev = 1; |
153 | 23.2M | tran_low_t before_best_eob_qc = 0; |
154 | 23.2M | tran_low_t before_best_eob_dqc = 0; |
155 | | |
156 | 23.2M | assert((!plane_type && !plane) || (plane_type && plane)); |
157 | 23.2M | assert(eob <= default_eob); |
158 | | |
159 | 460M | for (i = 0; i < eob; i++) { |
160 | 437M | const int rc = scan[i]; |
161 | 437M | ASSUME_VALID_SCAN_VALUE(rc); |
162 | 437M | int16_t token = vp9_get_token(qcoeff[rc]); |
163 | 437M | ASSUME_VALID_TOKEN(token); |
164 | 437M | token_cache[rc] = vp9_pt_energy_class[token]; |
165 | 437M | } |
166 | 23.2M | final_eob = 0; |
167 | | |
168 | | // This is used in the first iteration, and must be inbounds. We cannot |
169 | | // locally verify that this is in bounds, so we need to verify at runtime. |
170 | | // For now, only verify if we have array-bounds turned on. |
171 | 23.2M | #if defined(__clang__) && defined(__has_feature) |
172 | | #if __has_feature(array_bounds_sanitizer) |
173 | | if (ctx < 0 || ctx > MAX_ENERGY_CLASS) { |
174 | | abort(); |
175 | | } |
176 | | #endif |
177 | 23.2M | #endif |
178 | | |
179 | | // Initial RD cost. |
180 | 23.2M | token_costs_cur = token_costs + band_translate[0]; |
181 | 23.2M | rate0 = (*token_costs_cur)[0][ctx][EOB_TOKEN]; |
182 | 23.2M | best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error); |
183 | | |
184 | | // For each token, pick one of two choices greedily: |
185 | | // (i) First candidate: Keep current quantized value, OR |
186 | | // (ii) Second candidate: Reduce quantized value by 1. |
187 | 460M | for (i = 0; i < eob; i++) { |
188 | 437M | const int rc = scan[i]; |
189 | 437M | ASSUME_VALID_SCAN_VALUE(rc); |
190 | 437M | const int x = qcoeff[rc]; |
191 | 437M | const int band_cur = band_translate[i]; |
192 | 437M | const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i); |
193 | 437M | ASSUME_VALID_ENERGY_CLASS(ctx_cur); |
194 | 437M | const int token_tree_sel_cur = (x_prev == 0); |
195 | 437M | token_costs_cur = token_costs + band_cur; |
196 | 437M | if (x == 0) { // No need to search |
197 | 184M | const int token = vp9_get_token(x); |
198 | 184M | ASSUME_VALID_TOKEN(token); |
199 | 184M | rate0 = (*token_costs_cur)[token_tree_sel_cur][ctx_cur][token]; |
200 | 184M | accu_rate += rate0; |
201 | 184M | x_prev = 0; |
202 | | // Note: accu_error does not change. |
203 | 253M | } else { |
204 | 253M | const int dqv = dequant_ptr[rc != 0]; |
205 | | // Compute the distortion for quantizing to 0. |
206 | 253M | const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift); |
207 | 253M | const int diff_for_zero = |
208 | 253M | #if CONFIG_VP9_HIGHBITDEPTH |
209 | 253M | (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) |
210 | 253M | ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8) |
211 | 253M | : |
212 | 253M | #endif |
213 | 253M | diff_for_zero_raw; |
214 | 253M | const int64_t distortion_for_zero = |
215 | 253M | (int64_t)diff_for_zero * diff_for_zero; |
216 | | |
217 | | // Compute the distortion for the first candidate |
218 | 253M | const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift); |
219 | 253M | const int diff0 = |
220 | 253M | #if CONFIG_VP9_HIGHBITDEPTH |
221 | 253M | (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) |
222 | 253M | ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8) |
223 | 253M | : |
224 | 253M | #endif // CONFIG_VP9_HIGHBITDEPTH |
225 | 253M | diff0_raw; |
226 | 253M | const int64_t distortion0 = (int64_t)diff0 * diff0; |
227 | | |
228 | | // Compute the distortion for the second candidate |
229 | 253M | const int sign = -(x < 0); // -1 if x is negative and 0 otherwise. |
230 | 253M | const int x1 = x - 2 * sign - 1; // abs(x1) = abs(x) - 1. |
231 | 253M | int64_t distortion1; |
232 | 253M | if (x1 != 0) { |
233 | 169M | const int dqv_step = |
234 | 169M | #if CONFIG_VP9_HIGHBITDEPTH |
235 | 169M | (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8) |
236 | 169M | : |
237 | 169M | #endif // CONFIG_VP9_HIGHBITDEPTH |
238 | 169M | dqv; |
239 | 169M | const int diff_step = (dqv_step + sign) ^ sign; |
240 | 169M | const int diff1 = diff0 - diff_step; |
241 | 169M | assert(dqv > 0); // We aren't right shifting a negative number above. |
242 | 169M | distortion1 = (int64_t)diff1 * diff1; |
243 | 169M | } else { |
244 | 83.8M | distortion1 = distortion_for_zero; |
245 | 83.8M | } |
246 | 253M | { |
247 | | // Calculate RDCost for current coeff for the two candidates. |
248 | 253M | const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost); |
249 | 253M | const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost); |
250 | 253M | rate0 = |
251 | 253M | base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0]; |
252 | 253M | rate1 = |
253 | 253M | base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1]; |
254 | 253M | } |
255 | 253M | { |
256 | 253M | int rdcost_better_for_x1, eob_rdcost_better_for_x1; |
257 | 253M | int dqc0, dqc1; |
258 | 253M | int64_t best_eob_cost_cur; |
259 | 253M | int use_x1; |
260 | | |
261 | | // Calculate RD Cost effect on the next coeff for the two candidates. |
262 | 253M | int64_t next_bits0 = 0; |
263 | 253M | int64_t next_bits1 = 0; |
264 | 253M | int64_t next_eob_bits0 = 0; |
265 | 253M | int64_t next_eob_bits1 = 0; |
266 | 253M | if (i < default_eob - 1) { |
267 | 247M | int ctx_next, token_tree_sel_next; |
268 | 247M | const int band_next = band_translate[i + 1]; |
269 | 247M | const int token_next = |
270 | 247M | (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; |
271 | 247M | ASSUME_VALID_TOKEN(token_next); |
272 | 247M | unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS] |
273 | 247M | [ENTROPY_TOKENS] = |
274 | 247M | token_costs + band_next; |
275 | 247M | token_cache[rc] = vp9_pt_energy_class[t0]; |
276 | 247M | ctx_next = get_coef_context(nb, token_cache, i + 1); |
277 | | // token_cache is initialized with valid energy classes. |
278 | | // get_coef_context returns at most the maximum value of |
279 | | // token_cache. |
280 | 247M | ASSUME_VALID_ENERGY_CLASS(ctx_next); |
281 | 247M | token_tree_sel_next = (x == 0); |
282 | 247M | next_bits0 = |
283 | 247M | (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; |
284 | 247M | next_eob_bits0 = |
285 | 247M | (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; |
286 | 247M | token_cache[rc] = vp9_pt_energy_class[t1]; |
287 | 247M | ctx_next = get_coef_context(nb, token_cache, i + 1); |
288 | | // token_cache is initialized with valid energy classes. |
289 | | // get_coef_context returns at most the maximum value of |
290 | | // token_cache. |
291 | 247M | ASSUME_VALID_ENERGY_CLASS(ctx_next); |
292 | 247M | token_tree_sel_next = (x1 == 0); |
293 | 247M | next_bits1 = |
294 | 247M | (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; |
295 | 247M | if (x1 != 0) { |
296 | 165M | next_eob_bits1 = |
297 | 165M | (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; |
298 | 165M | } |
299 | 247M | } |
300 | | |
301 | | // Compare the total RD costs for two candidates. |
302 | 253M | rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0); |
303 | 253M | rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1); |
304 | 253M | rdcost_better_for_x1 = (rd_cost1 < rd_cost0); |
305 | 253M | eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0), |
306 | 253M | (accu_error + distortion0 - distortion_for_zero)); |
307 | 253M | eob_cost1 = eob_cost0; |
308 | 253M | if (x1 != 0) { |
309 | 169M | eob_cost1 = |
310 | 169M | RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1), |
311 | 169M | (accu_error + distortion1 - distortion_for_zero)); |
312 | 169M | eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0); |
313 | 169M | } else { |
314 | 83.8M | eob_rdcost_better_for_x1 = 0; |
315 | 83.8M | } |
316 | | |
317 | | // Calculate the two candidate de-quantized values. |
318 | 253M | dqc0 = dqcoeff[rc]; |
319 | 253M | dqc1 = 0; |
320 | 253M | if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) { |
321 | 3.29M | if (x1 != 0) { |
322 | 1.42M | dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift); |
323 | 1.86M | } else { |
324 | 1.86M | dqc1 = 0; |
325 | 1.86M | } |
326 | 3.29M | } |
327 | | |
328 | | // Pick and record the better quantized and de-quantized values. |
329 | 253M | if (rdcost_better_for_x1) { |
330 | 3.17M | qcoeff[rc] = x1; |
331 | 3.17M | dqcoeff[rc] = dqc1; |
332 | 3.17M | accu_rate += rate1; |
333 | 3.17M | accu_error += distortion1 - distortion_for_zero; |
334 | 3.17M | assert(distortion1 <= distortion_for_zero); |
335 | 3.17M | token_cache[rc] = vp9_pt_energy_class[t1]; |
336 | 250M | } else { |
337 | 250M | accu_rate += rate0; |
338 | 250M | accu_error += distortion0 - distortion_for_zero; |
339 | 250M | assert(distortion0 <= distortion_for_zero); |
340 | 250M | token_cache[rc] = vp9_pt_energy_class[t0]; |
341 | 250M | } |
342 | 253M | if (sharpness > 0 && abs(qcoeff[rc]) > 1) count_high_values_after_eob++; |
343 | 253M | assert(accu_error >= 0); |
344 | 253M | x_prev = qcoeff[rc]; // Update based on selected quantized value. |
345 | | |
346 | 253M | use_x1 = (x1 != 0) && eob_rdcost_better_for_x1; |
347 | 253M | best_eob_cost_cur = use_x1 ? eob_cost1 : eob_cost0; |
348 | | |
349 | | // Determine whether to move the eob position to i+1 |
350 | 253M | if (best_eob_cost_cur < best_block_rd_cost) { |
351 | 240M | best_block_rd_cost = best_eob_cost_cur; |
352 | 240M | final_eob = i + 1; |
353 | 240M | count_high_values_after_eob = 0; |
354 | 240M | if (use_x1) { |
355 | 1.41M | before_best_eob_qc = x1; |
356 | 1.41M | before_best_eob_dqc = dqc1; |
357 | 239M | } else { |
358 | 239M | before_best_eob_qc = x; |
359 | 239M | before_best_eob_dqc = dqc0; |
360 | 239M | } |
361 | 240M | } |
362 | 253M | } |
363 | 253M | } |
364 | 437M | } |
365 | 23.2M | if (count_high_values_after_eob > 0) { |
366 | 0 | final_eob = eob - 1; |
367 | 0 | for (; final_eob >= 0; final_eob--) { |
368 | 0 | const int rc = scan[final_eob]; |
369 | 0 | ASSUME_VALID_SCAN_VALUE(rc); |
370 | 0 | const int x = qcoeff[rc]; |
371 | 0 | if (x) { |
372 | 0 | break; |
373 | 0 | } |
374 | 0 | } |
375 | 0 | final_eob++; |
376 | 23.2M | } else { |
377 | 23.2M | assert(final_eob <= eob); |
378 | 23.2M | if (final_eob > 0) { |
379 | 11.6M | int rc; |
380 | 11.6M | assert(before_best_eob_qc != 0); |
381 | 11.6M | i = final_eob - 1; |
382 | 11.6M | rc = scan[i]; |
383 | 11.6M | ASSUME_VALID_SCAN_VALUE(rc); |
384 | 11.6M | qcoeff[rc] = before_best_eob_qc; |
385 | 11.6M | dqcoeff[rc] = before_best_eob_dqc; |
386 | 11.6M | } |
387 | 52.9M | for (i = final_eob; i < eob; i++) { |
388 | 29.6M | int rc = scan[i]; |
389 | 29.6M | ASSUME_VALID_SCAN_VALUE(rc); |
390 | 29.6M | qcoeff[rc] = 0; |
391 | 29.6M | dqcoeff[rc] = 0; |
392 | 29.6M | } |
393 | 23.2M | } |
394 | 23.2M | mb->plane[plane].eobs[block] = final_eob; |
395 | 23.2M | return final_eob; |
396 | 23.2M | } |
397 | | #undef RIGHT_SHIFT_POSSIBLY_NEGATIVE |
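Two details of the loop above are easy to miss. First, the branch-free step that builds the second candidate: for non-zero x, sign = -(x < 0) is -1 for negative values and 0 otherwise, so x1 = x - 2*sign - 1 always satisfies abs(x1) == abs(x) - 1. Second, the greedy choice compares the two candidates with RDCOST (including their effect on the next coefficient's context) while separately tracking the cheapest position at which to end the block, which becomes final_eob. A small stand-alone sketch of the magnitude step:

    // Illustration only (not a libvpx helper): reduce |x| by one without branches.
    static int reduce_magnitude_by_one(int x) {
      const int sign = -(x < 0);  // -1 if x < 0, else 0
      return x - 2 * sign - 1;    // 3 -> 2, -3 -> -2, 1 -> 0, -1 -> 0
    }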
398 | | |
399 | | static INLINE void fdct32x32(int rd_transform, const int16_t *src, |
400 | 3.80M | tran_low_t *dst, int src_stride) { |
401 | 3.80M | if (rd_transform) |
402 | 3.62M | vpx_fdct32x32_rd(src, dst, src_stride); |
403 | 176k | else |
404 | 176k | vpx_fdct32x32(src, dst, src_stride); |
405 | 3.80M | } |
406 | | |
407 | | #if CONFIG_VP9_HIGHBITDEPTH |
408 | | static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src, |
409 | 0 | tran_low_t *dst, int src_stride) { |
410 | 0 | if (rd_transform) |
411 | 0 | vpx_highbd_fdct32x32_rd(src, dst, src_stride); |
412 | 0 | else |
413 | 0 | vpx_highbd_fdct32x32(src, dst, src_stride); |
414 | 0 | } |
415 | | #endif // CONFIG_VP9_HIGHBITDEPTH |
416 | | |
417 | | void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, |
418 | 0 | BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { |
419 | 0 | MACROBLOCKD *const xd = &x->e_mbd; |
420 | 0 | const struct macroblock_plane *const p = &x->plane[plane]; |
421 | 0 | const struct macroblockd_plane *const pd = &xd->plane[plane]; |
422 | 0 | const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; |
423 | 0 | tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); |
424 | 0 | tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); |
425 | 0 | tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); |
426 | 0 | uint16_t *const eob = &p->eobs[block]; |
427 | 0 | const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; |
428 | 0 | const int16_t *src_diff; |
429 | 0 | src_diff = &p->src_diff[4 * (row * diff_stride + col)]; |
430 | | // skip block condition should be handled before this is called. |
431 | 0 | assert(!x->skip_block); |
432 | |
433 | 0 | #if CONFIG_VP9_HIGHBITDEPTH |
434 | 0 | if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
435 | 0 | switch (tx_size) { |
436 | 0 | case TX_32X32: |
437 | 0 | highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); |
438 | 0 | vp9_highbd_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, |
439 | 0 | pd->dequant, eob, scan_order); |
440 | 0 | break; |
441 | 0 | case TX_16X16: |
442 | 0 | vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); |
443 | 0 | vp9_highbd_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, |
444 | 0 | scan_order); |
445 | 0 | break; |
446 | 0 | case TX_8X8: |
447 | 0 | vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); |
448 | 0 | vp9_highbd_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, |
449 | 0 | scan_order); |
450 | 0 | break; |
451 | 0 | default: |
452 | 0 | assert(tx_size == TX_4X4); |
453 | 0 | x->fwd_txfm4x4(src_diff, coeff, diff_stride); |
454 | 0 | vp9_highbd_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, |
455 | 0 | scan_order); |
456 | 0 | break; |
457 | 0 | } |
458 | 0 | return; |
459 | 0 | } |
460 | 0 | #endif // CONFIG_VP9_HIGHBITDEPTH |
461 | | |
462 | 0 | switch (tx_size) { |
463 | 0 | case TX_32X32: |
464 | 0 | fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); |
465 | 0 | vp9_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, pd->dequant, eob, |
466 | 0 | scan_order); |
467 | 0 | break; |
468 | 0 | case TX_16X16: |
469 | 0 | vpx_fdct16x16(src_diff, coeff, diff_stride); |
470 | 0 | vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, |
471 | 0 | scan_order); |
472 | 0 | break; |
473 | 0 | case TX_8X8: |
474 | 0 | vpx_fdct8x8(src_diff, coeff, diff_stride); |
475 | 0 | vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, |
476 | 0 | scan_order); |
477 | |
478 | 0 | break; |
479 | 0 | default: |
480 | 0 | assert(tx_size == TX_4X4); |
481 | 0 | x->fwd_txfm4x4(src_diff, coeff, diff_stride); |
482 | 0 | vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, |
483 | 0 | scan_order); |
484 | 0 | break; |
485 | 0 | } |
486 | 0 | } |
487 | | |
488 | | void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, |
489 | 313k | BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { |
490 | 313k | MACROBLOCKD *const xd = &x->e_mbd; |
491 | 313k | const struct macroblock_plane *const p = &x->plane[plane]; |
492 | 313k | const struct macroblockd_plane *const pd = &xd->plane[plane]; |
493 | 313k | tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); |
494 | 313k | tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); |
495 | 313k | tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); |
496 | 313k | uint16_t *const eob = &p->eobs[block]; |
497 | 313k | const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; |
498 | 313k | const int16_t *src_diff; |
499 | 313k | src_diff = &p->src_diff[4 * (row * diff_stride + col)]; |
500 | | // skip block condition should be handled before this is called. |
501 | 313k | assert(!x->skip_block); |
502 | | |
503 | 313k | #if CONFIG_VP9_HIGHBITDEPTH |
504 | 313k | if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
505 | 0 | switch (tx_size) { |
506 | 0 | case TX_32X32: |
507 | 0 | vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride); |
508 | 0 | vpx_highbd_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, |
509 | 0 | dqcoeff, pd->dequant[0], eob); |
510 | 0 | break; |
511 | 0 | case TX_16X16: |
512 | 0 | vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride); |
513 | 0 | vpx_highbd_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, |
514 | 0 | dqcoeff, pd->dequant[0], eob); |
515 | 0 | break; |
516 | 0 | case TX_8X8: |
517 | 0 | vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride); |
518 | 0 | vpx_highbd_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, |
519 | 0 | dqcoeff, pd->dequant[0], eob); |
520 | 0 | break; |
521 | 0 | default: |
522 | 0 | assert(tx_size == TX_4X4); |
523 | 0 | x->fwd_txfm4x4(src_diff, coeff, diff_stride); |
524 | 0 | vpx_highbd_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, |
525 | 0 | dqcoeff, pd->dequant[0], eob); |
526 | 0 | break; |
527 | 0 | } |
528 | 0 | return; |
529 | 0 | } |
530 | 313k | #endif // CONFIG_VP9_HIGHBITDEPTH |
531 | | |
532 | 313k | switch (tx_size) { |
533 | 4.27k | case TX_32X32: |
534 | 4.27k | vpx_fdct32x32_1(src_diff, coeff, diff_stride); |
535 | 4.27k | vpx_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, dqcoeff, |
536 | 4.27k | pd->dequant[0], eob); |
537 | 4.27k | break; |
538 | 7.67k | case TX_16X16: |
539 | 7.67k | vpx_fdct16x16_1(src_diff, coeff, diff_stride); |
540 | 7.67k | vpx_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, dqcoeff, |
541 | 7.67k | pd->dequant[0], eob); |
542 | 7.67k | break; |
543 | 49.8k | case TX_8X8: |
544 | 49.8k | vpx_fdct8x8_1(src_diff, coeff, diff_stride); |
545 | 49.8k | vpx_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, dqcoeff, |
546 | 49.8k | pd->dequant[0], eob); |
547 | 49.8k | break; |
548 | 251k | default: |
549 | 251k | assert(tx_size == TX_4X4); |
550 | 251k | x->fwd_txfm4x4(src_diff, coeff, diff_stride); |
551 | 251k | vpx_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, dqcoeff, |
552 | 251k | pd->dequant[0], eob); |
553 | 251k | break; |
554 | 313k | } |
555 | 313k | } |
556 | | |
557 | | void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, |
558 | 48.7M | BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { |
559 | 48.7M | MACROBLOCKD *const xd = &x->e_mbd; |
560 | 48.7M | const struct macroblock_plane *const p = &x->plane[plane]; |
561 | 48.7M | const struct macroblockd_plane *const pd = &xd->plane[plane]; |
562 | 48.7M | const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; |
563 | 48.7M | tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); |
564 | 48.7M | tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); |
565 | 48.7M | tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); |
566 | 48.7M | uint16_t *const eob = &p->eobs[block]; |
567 | 48.7M | const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; |
568 | 48.7M | const int16_t *src_diff; |
569 | 48.7M | src_diff = &p->src_diff[4 * (row * diff_stride + col)]; |
570 | | // skip block condition should be handled before this is called. |
571 | 48.7M | assert(!x->skip_block); |
572 | | |
573 | 48.7M | #if CONFIG_VP9_HIGHBITDEPTH |
574 | 48.7M | if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
575 | 0 | switch (tx_size) { |
576 | 0 | case TX_32X32: |
577 | 0 | highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); |
578 | 0 | vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, |
579 | 0 | scan_order); |
580 | 0 | break; |
581 | 0 | case TX_16X16: |
582 | 0 | vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); |
583 | 0 | vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, |
584 | 0 | scan_order); |
585 | 0 | break; |
586 | 0 | case TX_8X8: |
587 | 0 | vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); |
588 | 0 | vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, |
589 | 0 | scan_order); |
590 | 0 | break; |
591 | 0 | default: |
592 | 0 | assert(tx_size == TX_4X4); |
593 | 0 | x->fwd_txfm4x4(src_diff, coeff, diff_stride); |
594 | 0 | vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, |
595 | 0 | scan_order); |
596 | 0 | break; |
597 | 0 | } |
598 | 0 | return; |
599 | 0 | } |
600 | 48.7M | #endif // CONFIG_VP9_HIGHBITDEPTH |
601 | | |
602 | 48.7M | switch (tx_size) { |
603 | 731k | case TX_32X32: |
604 | 731k | fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); |
605 | 731k | vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, |
606 | 731k | scan_order); |
607 | 731k | break; |
608 | 3.12M | case TX_16X16: |
609 | 3.12M | vpx_fdct16x16(src_diff, coeff, diff_stride); |
610 | 3.12M | vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, |
611 | 3.12M | scan_order); |
612 | 3.12M | break; |
613 | 13.1M | case TX_8X8: |
614 | 13.1M | vpx_fdct8x8(src_diff, coeff, diff_stride); |
615 | 13.1M | vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, |
616 | 13.1M | scan_order); |
617 | 13.1M | break; |
618 | 31.7M | default: |
619 | 31.7M | assert(tx_size == TX_4X4); |
620 | 31.7M | x->fwd_txfm4x4(src_diff, coeff, diff_stride); |
621 | 31.7M | vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, |
622 | 31.7M | scan_order); |
623 | 31.7M | break; |
624 | 48.7M | } |
625 | 48.7M | } |
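The coefficient counts passed to the quantizers above (16, 64, 256 and 1024) are simply the number of coefficients in each transform block, i.e. 16 << (tx_size << 1), the same expression used for default_eob in vp9_optimize_b(). A quick check, assuming the usual TX_4X4..TX_32X32 ordering of 0..3:

    // Sanity sketch: coefficients per transform block.
    //   TX_4X4  (0): 16 << 0 == 16
    //   TX_8X8  (1): 16 << 2 == 64
    //   TX_16X16(2): 16 << 4 == 256
    //   TX_32X32(3): 16 << 6 == 1024
    static int num_coeffs(int tx_size) { return 16 << (tx_size << 1); }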
626 | | |
627 | | static void encode_block(int plane, int block, int row, int col, |
628 | 8.07M | BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { |
629 | 8.07M | struct encode_b_args *const args = arg; |
630 | | #if CONFIG_MISMATCH_DEBUG |
631 | | int mi_row = args->mi_row; |
632 | | int mi_col = args->mi_col; |
633 | | int output_enabled = args->output_enabled; |
634 | | #endif |
635 | 8.07M | MACROBLOCK *const x = args->x; |
636 | 8.07M | MACROBLOCKD *const xd = &x->e_mbd; |
637 | 8.07M | struct macroblock_plane *const p = &x->plane[plane]; |
638 | 8.07M | struct macroblockd_plane *const pd = &xd->plane[plane]; |
639 | 8.07M | tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); |
640 | 8.07M | uint8_t *dst; |
641 | 8.07M | ENTROPY_CONTEXT *a, *l; |
642 | 8.07M | dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; |
643 | 8.07M | a = &args->ta[col]; |
644 | 8.07M | l = &args->tl[row]; |
645 | | |
646 | | // TODO(jingning): per transformed block zero forcing only enabled for |
647 | | // luma component. will integrate chroma components as well. |
648 | 8.07M | if (x->zcoeff_blk[tx_size][block] && plane == 0) { |
649 | 2.62M | p->eobs[block] = 0; |
650 | 2.62M | *a = *l = 0; |
651 | | #if CONFIG_MISMATCH_DEBUG |
652 | | goto encode_block_end; |
653 | | #else |
654 | 2.62M | return; |
655 | 2.62M | #endif |
656 | 2.62M | } |
657 | | |
658 | 5.45M | if (!x->skip_recode) { |
659 | 5.45M | if (x->quant_fp) { |
660 | | // Encoding process for rtc mode |
661 | 0 | if (x->skip_txfm[0] == SKIP_TXFM_AC_DC && plane == 0) { |
662 | | // skip forward transform |
663 | 0 | p->eobs[block] = 0; |
664 | 0 | *a = *l = 0; |
665 | | #if CONFIG_MISMATCH_DEBUG |
666 | | goto encode_block_end; |
667 | | #else |
668 | 0 | return; |
669 | 0 | #endif |
670 | 0 | } else { |
671 | 0 | vp9_xform_quant_fp(x, plane, block, row, col, plane_bsize, tx_size); |
672 | 0 | } |
673 | 5.45M | } else { |
674 | 5.45M | if (max_txsize_lookup[plane_bsize] == tx_size) { |
675 | 3.43M | int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1)); |
676 | 3.43M | if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) { |
677 | | // full forward transform and quantization |
678 | 3.43M | vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size); |
679 | 3.43M | } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) { |
680 | | // fast path forward transform and quantization |
681 | 0 | vp9_xform_quant_dc(x, plane, block, row, col, plane_bsize, tx_size); |
682 | 0 | } else { |
683 | | // skip forward transform |
684 | 0 | p->eobs[block] = 0; |
685 | 0 | *a = *l = 0; |
686 | | #if CONFIG_MISMATCH_DEBUG |
687 | | goto encode_block_end; |
688 | | #else |
689 | 0 | return; |
690 | 0 | #endif |
691 | 0 | } |
692 | 3.43M | } else { |
693 | 2.02M | vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size); |
694 | 2.02M | } |
695 | 5.45M | } |
696 | 5.45M | } |
697 | | |
698 | 5.45M | if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { |
699 | 0 | const int ctx = combine_entropy_contexts(*a, *l); |
700 | 0 | *a = *l = vp9_optimize_b(x, plane, block, tx_size, ctx) > 0; |
701 | 5.45M | } else { |
702 | 5.45M | *a = *l = p->eobs[block] > 0; |
703 | 5.45M | } |
704 | | |
705 | 5.45M | if (p->eobs[block]) *(args->skip) = 0; |
706 | | |
707 | 5.45M | if (x->skip_encode || p->eobs[block] == 0) { |
708 | | #if CONFIG_MISMATCH_DEBUG |
709 | | goto encode_block_end; |
710 | | #else |
711 | 458k | return; |
712 | 458k | #endif |
713 | 458k | } |
714 | 4.99M | #if CONFIG_VP9_HIGHBITDEPTH |
715 | 4.99M | if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
716 | 0 | uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); |
717 | 0 | switch (tx_size) { |
718 | 0 | case TX_32X32: |
719 | 0 | vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], |
720 | 0 | xd->bd); |
721 | 0 | break; |
722 | 0 | case TX_16X16: |
723 | 0 | vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], |
724 | 0 | xd->bd); |
725 | 0 | break; |
726 | 0 | case TX_8X8: |
727 | 0 | vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], |
728 | 0 | xd->bd); |
729 | 0 | break; |
730 | 0 | default: |
731 | 0 | assert(tx_size == TX_4X4); |
732 | | // this is like vp9_short_idct4x4 but has a special case around eob<=1 |
733 | | // which is significant (not just an optimization) for the lossless |
734 | | // case. |
735 | 0 | x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], |
736 | 0 | xd->bd); |
737 | 0 | break; |
738 | 0 | } |
739 | | #if CONFIG_MISMATCH_DEBUG |
740 | | goto encode_block_end; |
741 | | #else |
742 | 0 | return; |
743 | 0 | #endif |
744 | 0 | } |
745 | 4.99M | #endif // CONFIG_VP9_HIGHBITDEPTH |
746 | | |
747 | 4.99M | switch (tx_size) { |
748 | 6.06k | case TX_32X32: |
749 | 6.06k | vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); |
750 | 6.06k | break; |
751 | 43.5k | case TX_16X16: |
752 | 43.5k | vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); |
753 | 43.5k | break; |
754 | 314k | case TX_8X8: |
755 | 314k | vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); |
756 | 314k | break; |
757 | 4.63M | default: |
758 | 4.63M | assert(tx_size == TX_4X4); |
759 | | // this is like vp9_short_idct4x4 but has a special case around eob<=1 |
760 | | // which is significant (not just an optimization) for the lossless |
761 | | // case. |
762 | 4.63M | x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); |
763 | 4.63M | break; |
764 | 4.99M | } |
765 | | #if CONFIG_MISMATCH_DEBUG |
766 | | encode_block_end: |
767 | | if (output_enabled) { |
768 | | int pixel_c, pixel_r; |
769 | | int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2); |
770 | | int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2); |
771 | | mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row, |
772 | | pd->subsampling_x, pd->subsampling_y); |
773 | | mismatch_record_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r, |
774 | | blk_w, blk_h, |
775 | | xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); |
776 | | } |
777 | | #endif |
778 | 4.99M | } |
779 | | |
780 | | static void encode_block_pass1(int plane, int block, int row, int col, |
781 | | BLOCK_SIZE plane_bsize, TX_SIZE tx_size, |
782 | 0 | void *arg) { |
783 | 0 | MACROBLOCK *const x = (MACROBLOCK *)arg; |
784 | 0 | MACROBLOCKD *const xd = &x->e_mbd; |
785 | 0 | struct macroblock_plane *const p = &x->plane[plane]; |
786 | 0 | struct macroblockd_plane *const pd = &xd->plane[plane]; |
787 | 0 | tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); |
788 | 0 | uint8_t *dst; |
789 | 0 | dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; |
790 | 0 |
791 | 0 | vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size); |
792 | 0 |
793 | 0 | if (p->eobs[block] > 0) { |
794 | 0 | #if CONFIG_VP9_HIGHBITDEPTH |
795 | 0 | if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
796 | 0 | x->highbd_inv_txfm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, |
797 | 0 | p->eobs[block], xd->bd); |
798 | 0 | return; |
799 | 0 | } |
800 | 0 | #endif // CONFIG_VP9_HIGHBITDEPTH |
801 | 0 | x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); |
802 | 0 | } |
803 | 0 | } |
804 | | |
805 | 0 | void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { |
806 | 0 | vp9_subtract_plane(x, bsize, 0); |
807 | 0 | vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, |
808 | 0 | encode_block_pass1, x); |
809 | 0 | } |
810 | | |
811 | | void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, |
812 | 1.81M | int output_enabled) { |
813 | 1.81M | MACROBLOCKD *const xd = &x->e_mbd; |
814 | 1.81M | struct optimize_ctx ctx; |
815 | 1.81M | MODE_INFO *mi = xd->mi[0]; |
816 | 1.81M | int plane; |
817 | | #if CONFIG_MISMATCH_DEBUG |
818 | | struct encode_b_args arg = { x, |
819 | | 1, // enable_trellis_opt |
820 | | 0.0, // trellis_opt_thresh |
821 | | NULL, // &sse_calc_done |
822 | | NULL, // &sse |
823 | | NULL, // above entropy context |
824 | | NULL, // left entropy context |
825 | | &mi->skip, mi_row, mi_col, output_enabled }; |
826 | | #else |
827 | 1.81M | struct encode_b_args arg = { x, |
828 | 1.81M | 1, // enable_trellis_opt |
829 | 1.81M | 0.0, // trellis_opt_thresh |
830 | 1.81M | NULL, // &sse_calc_done |
831 | 1.81M | NULL, // &sse |
832 | 1.81M | NULL, // above entropy context |
833 | 1.81M | NULL, // left entropy context |
834 | 1.81M | &mi->skip }; |
835 | 1.81M | (void)mi_row; |
836 | 1.81M | (void)mi_col; |
837 | 1.81M | (void)output_enabled; |
838 | 1.81M | #endif |
839 | | |
840 | 1.81M | mi->skip = 1; |
841 | | |
842 | 1.81M | if (x->skip) return; |
843 | | |
844 | 5.82M | for (plane = 0; plane < MAX_MB_PLANE; ++plane) { |
845 | 4.36M | if (!x->skip_recode) vp9_subtract_plane(x, bsize, plane); |
846 | | |
847 | 4.36M | if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { |
848 | 0 | const struct macroblockd_plane *const pd = &xd->plane[plane]; |
849 | 0 | const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; |
850 | 0 | vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], |
851 | 0 | ctx.tl[plane]); |
852 | 0 | arg.enable_trellis_opt = 1; |
853 | 4.36M | } else { |
854 | 4.36M | arg.enable_trellis_opt = 0; |
855 | 4.36M | } |
856 | 4.36M | arg.ta = ctx.ta[plane]; |
857 | 4.36M | arg.tl = ctx.tl[plane]; |
858 | | |
859 | 4.36M | vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block, |
860 | 4.36M | &arg); |
861 | 4.36M | } |
862 | 1.45M | } |
863 | | |
864 | | void vp9_encode_block_intra(int plane, int block, int row, int col, |
865 | | BLOCK_SIZE plane_bsize, TX_SIZE tx_size, |
866 | 240M | void *arg) { |
867 | 240M | struct encode_b_args *const args = arg; |
868 | 240M | MACROBLOCK *const x = args->x; |
869 | 240M | MACROBLOCKD *const xd = &x->e_mbd; |
870 | 240M | MODE_INFO *mi = xd->mi[0]; |
871 | 240M | struct macroblock_plane *const p = &x->plane[plane]; |
872 | 240M | struct macroblockd_plane *const pd = &xd->plane[plane]; |
873 | 240M | tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block); |
874 | 240M | tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); |
875 | 240M | tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); |
876 | 240M | const ScanOrder *scan_order; |
877 | 240M | TX_TYPE tx_type = DCT_DCT; |
878 | 240M | PREDICTION_MODE mode; |
879 | 240M | const int bwl = b_width_log2_lookup[plane_bsize]; |
880 | 240M | const int diff_stride = 4 * (1 << bwl); |
881 | 240M | uint8_t *src, *dst; |
882 | 240M | int16_t *src_diff; |
883 | 240M | uint16_t *eob = &p->eobs[block]; |
884 | 240M | const int src_stride = p->src.stride; |
885 | 240M | const int dst_stride = pd->dst.stride; |
886 | 240M | int enable_trellis_opt = !x->skip_recode; |
887 | 240M | ENTROPY_CONTEXT *a = NULL; |
888 | 240M | ENTROPY_CONTEXT *l = NULL; |
889 | 240M | int entropy_ctx = 0; |
890 | 240M | dst = &pd->dst.buf[4 * (row * dst_stride + col)]; |
891 | 240M | src = &p->src.buf[4 * (row * src_stride + col)]; |
892 | 240M | src_diff = &p->src_diff[4 * (row * diff_stride + col)]; |
893 | | |
894 | 240M | if (tx_size == TX_4X4) { |
895 | 177M | tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block); |
896 | 177M | scan_order = &vp9_scan_orders[TX_4X4][tx_type]; |
897 | 177M | mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mi->uv_mode; |
898 | 177M | } else { |
899 | 62.9M | mode = plane == 0 ? mi->mode : mi->uv_mode; |
900 | 62.9M | if (tx_size == TX_32X32) { |
901 | 3.06M | scan_order = &vp9_default_scan_orders[TX_32X32]; |
902 | 59.8M | } else { |
903 | 59.8M | tx_type = get_tx_type(get_plane_type(plane), xd); |
904 | 59.8M | scan_order = &vp9_scan_orders[tx_size][tx_type]; |
905 | 59.8M | } |
906 | 62.9M | } |
907 | | |
908 | 240M | vp9_predict_intra_block( |
909 | 240M | xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst, |
910 | 240M | (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst, |
911 | 240M | dst_stride, col, row, plane); |
912 | | |
913 | | // skip block condition should be handled before this is called. |
914 | 240M | assert(!x->skip_block); |
915 | | |
916 | 240M | if (!x->skip_recode) { |
917 | 240M | const int tx_size_in_pixels = (1 << tx_size) << 2; |
918 | 240M | #if CONFIG_VP9_HIGHBITDEPTH |
919 | 240M | if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
920 | 0 | vpx_highbd_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, |
921 | 0 | diff_stride, src, src_stride, dst, dst_stride, |
922 | 0 | xd->bd); |
923 | 240M | } else { |
924 | 240M | vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, |
925 | 240M | diff_stride, src, src_stride, dst, dst_stride); |
926 | 240M | } |
927 | | #else |
928 | | vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, |
929 | | diff_stride, src, src_stride, dst, dst_stride); |
930 | | #endif |
931 | 240M | enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col, |
932 | 240M | plane_bsize, tx_size, args); |
933 | 240M | } |
934 | | |
935 | 240M | if (enable_trellis_opt) { |
936 | 19.4M | a = &args->ta[col]; |
937 | 19.4M | l = &args->tl[row]; |
938 | 19.4M | entropy_ctx = combine_entropy_contexts(*a, *l); |
939 | 19.4M | } |
940 | | |
941 | 240M | #if CONFIG_VP9_HIGHBITDEPTH |
942 | 240M | if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
943 | 0 | uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); |
944 | 0 | switch (tx_size) { |
945 | 0 | case TX_32X32: |
946 | 0 | if (!x->skip_recode) { |
947 | 0 | highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); |
948 | 0 | vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, |
949 | 0 | eob, scan_order); |
950 | 0 | } |
951 | 0 | if (enable_trellis_opt) { |
952 | 0 | *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; |
953 | 0 | } |
954 | 0 | if (!x->skip_encode && *eob) { |
955 | 0 | vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); |
956 | 0 | } |
957 | 0 | break; |
958 | 0 | case TX_16X16: |
959 | 0 | if (!x->skip_recode) { |
960 | 0 | if (tx_type == DCT_DCT) |
961 | 0 | vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); |
962 | 0 | else |
963 | 0 | vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); |
964 | 0 | vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, |
965 | 0 | eob, scan_order); |
966 | 0 | } |
967 | 0 | if (enable_trellis_opt) { |
968 | 0 | *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; |
969 | 0 | } |
970 | 0 | if (!x->skip_encode && *eob) { |
971 | 0 | vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob, |
972 | 0 | xd->bd); |
973 | 0 | } |
974 | 0 | break; |
975 | 0 | case TX_8X8: |
976 | 0 | if (!x->skip_recode) { |
977 | 0 | if (tx_type == DCT_DCT) |
978 | 0 | vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); |
979 | 0 | else |
980 | 0 | vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); |
981 | 0 | vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, |
982 | 0 | scan_order); |
983 | 0 | } |
984 | 0 | if (enable_trellis_opt) { |
985 | 0 | *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; |
986 | 0 | } |
987 | 0 | if (!x->skip_encode && *eob) { |
988 | 0 | vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob, |
989 | 0 | xd->bd); |
990 | 0 | } |
991 | 0 | break; |
992 | 0 | default: |
993 | 0 | assert(tx_size == TX_4X4); |
994 | 0 | if (!x->skip_recode) { |
995 | 0 | if (tx_type != DCT_DCT) |
996 | 0 | vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); |
997 | 0 | else |
998 | 0 | x->fwd_txfm4x4(src_diff, coeff, diff_stride); |
999 | 0 | vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, |
1000 | 0 | scan_order); |
1001 | 0 | } |
1002 | 0 | if (enable_trellis_opt) { |
1003 | 0 | *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; |
1004 | 0 | } |
1005 | 0 | if (!x->skip_encode && *eob) { |
1006 | 0 | if (tx_type == DCT_DCT) { |
1007 | | // this is like vp9_short_idct4x4 but has a special case around |
1008 | | // eob<=1 which is significant (not just an optimization) for the |
1009 | | // lossless case. |
1010 | 0 | x->highbd_inv_txfm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); |
1011 | 0 | } else { |
1012 | 0 | vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type, |
1013 | 0 | xd->bd); |
1014 | 0 | } |
1015 | 0 | } |
1016 | 0 | break; |
1017 | 0 | } |
1018 | 0 | if (*eob) *(args->skip) = 0; |
1019 | 0 | return; |
1020 | 0 | } |
1021 | 240M | #endif // CONFIG_VP9_HIGHBITDEPTH |
1022 | | |
1023 | 240M | switch (tx_size) { |
1024 | 3.06M | case TX_32X32: |
1025 | 3.06M | if (!x->skip_recode) { |
1026 | 3.06M | fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); |
1027 | 3.06M | vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, |
1028 | 3.06M | scan_order); |
1029 | 3.06M | } |
1030 | 3.06M | if (enable_trellis_opt) { |
1031 | 493k | *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; |
1032 | 493k | } |
1033 | 3.06M | if (!x->skip_encode && *eob) |
1034 | 2.04M | vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob); |
1035 | 3.06M | break; |
1036 | 10.7M | case TX_16X16: |
1037 | 10.7M | if (!x->skip_recode) { |
1038 | 10.7M | vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); |
1039 | 10.7M | vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, |
1040 | 10.7M | scan_order); |
1041 | 10.7M | } |
1042 | 10.7M | if (enable_trellis_opt) { |
1043 | 910k | *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; |
1044 | 910k | } |
1045 | 10.7M | if (!x->skip_encode && *eob) |
1046 | 8.48M | vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob); |
1047 | 10.7M | break; |
1048 | 49.1M | case TX_8X8: |
1049 | 49.1M | if (!x->skip_recode) { |
1050 | 49.1M | vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); |
1051 | 49.1M | vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, |
1052 | 49.1M | scan_order); |
1053 | 49.1M | } |
1054 | 49.1M | if (enable_trellis_opt) { |
1055 | 3.77M | *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; |
1056 | 3.77M | } |
1057 | 49.1M | if (!x->skip_encode && *eob) |
1058 | 38.8M | vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob); |
1059 | 49.1M | break; |
1060 | 177M | default: |
1061 | 177M | assert(tx_size == TX_4X4); |
1062 | 177M | if (!x->skip_recode) { |
1063 | 177M | if (tx_type != DCT_DCT) |
1064 | 15.9M | vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); |
1065 | 161M | else |
1066 | 161M | x->fwd_txfm4x4(src_diff, coeff, diff_stride); |
1067 | 177M | vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, |
1068 | 177M | scan_order); |
1069 | 177M | } |
1070 | 177M | if (enable_trellis_opt) { |
1071 | 14.2M | *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; |
1072 | 14.2M | } |
1073 | 177M | if (!x->skip_encode && *eob) { |
1074 | 132M | if (tx_type == DCT_DCT) |
1075 | | // this is like vp9_short_idct4x4 but has a special case around eob<=1 |
1076 | | // which is significant (not just an optimization) for the lossless |
1077 | | // case. |
1078 | 120M | x->inv_txfm_add(dqcoeff, dst, dst_stride, *eob); |
1079 | 12.6M | else |
1080 | 12.6M | vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); |
1081 | 132M | } |
1082 | 177M | break; |
1083 | 240M | } |
1084 | 240M | if (*eob) *(args->skip) = 0; |
1085 | 240M | } |
1086 | | |
1087 | | void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, |
1088 | 13.4M | int enable_trellis_opt) { |
1089 | 13.4M | const MACROBLOCKD *const xd = &x->e_mbd; |
1090 | 13.4M | struct optimize_ctx ctx; |
1091 | | #if CONFIG_MISMATCH_DEBUG |
1092 | | // TODO(angiebird): make mismatch_debug support intra mode |
1093 | | struct encode_b_args arg = { |
1094 | | x, |
1095 | | enable_trellis_opt, |
1096 | | 0.0, // trellis_opt_thresh |
1097 | | NULL, // &sse_calc_done |
1098 | | NULL, // &sse |
1099 | | ctx.ta[plane], |
1100 | | ctx.tl[plane], |
1101 | | &xd->mi[0]->skip, |
1102 | | 0, // mi_row |
1103 | | 0, // mi_col |
1104 | | 0 // output_enabled |
1105 | | }; |
1106 | | #else |
1107 | 13.4M | struct encode_b_args arg = { x, |
1108 | 13.4M | enable_trellis_opt, |
1109 | 13.4M | 0.0, // trellis_opt_thresh |
1110 | 13.4M | NULL, // &sse_calc_done |
1111 | 13.4M | NULL, // &sse |
1112 | 13.4M | ctx.ta[plane], |
1113 | 13.4M | ctx.tl[plane], |
1114 | 13.4M | &xd->mi[0]->skip }; |
1115 | 13.4M | #endif |
1116 | | |
1117 | 13.4M | if (enable_trellis_opt && x->optimize && |
1118 | 0 | (!x->skip_recode || !x->skip_optimize)) { |
1119 | 0 | const struct macroblockd_plane *const pd = &xd->plane[plane]; |
1120 | 0 | const TX_SIZE tx_size = |
1121 | 0 | plane ? get_uv_tx_size(xd->mi[0], pd) : xd->mi[0]->tx_size; |
1122 | 0 | vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); |
1123 | 13.4M | } else { |
1124 | 13.4M | arg.enable_trellis_opt = 0; |
1125 | 13.4M | } |
1126 | | |
1127 | 13.4M | vp9_foreach_transformed_block_in_plane(xd, bsize, plane, |
1128 | 13.4M | vp9_encode_block_intra, &arg); |
1129 | 13.4M | } |