/work/svt-av1/Source/Lib/Codec/enc_cdef.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 3-Clause Clear License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at https://www.aomedia.org/license. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license. |
10 | | */ |
11 | | #include <stdio.h> |
12 | | #include <stdlib.h> |
13 | | #include <math.h> |
14 | | #include <string.h> |
15 | | |
16 | | #include "enc_cdef.h" |
17 | | #include <stdint.h> |
18 | | #include "aom_dsp_rtcd.h" |
19 | | #include "svt_log.h" |
20 | | #include "rd_cost.h" |
21 | | #include "rc_process.h" |
22 | | |
/*
 * Sum of squared differences between a packed, 8-sample-wide tile and the
 * matching area of a strided picture (16-bit samples).
 *
 * src                : packed tile, 8 samples per row (row i starts at src[8 * i]).
 * dst                : top-left sample of the compared area in the picture.
 * dstride            : distance, in samples, between consecutive dst rows.
 * height             : number of rows in the tile.
 * subsampling_factor : row step; only rows 0, f, 2f, ... are visited. Must be
 *                      non-zero, otherwise the row loop never advances.
 *
 * Returns the un-normalised sum of squared errors over the visited rows.
 */
static inline uint64_t mse_8xn_16bit_c(const uint16_t* src, const uint16_t* dst, const int32_t dstride,
                                       const int32_t height, uint8_t subsampling_factor) {
    uint64_t sse = 0;
    for (int32_t row = 0; row < height; row += subsampling_factor) {
        for (int32_t col = 0; col < 8; col++) {
            // Square in 64 bits: |diff| can reach 65535 and 65535^2 does not
            // fit in int32_t (signed overflow is undefined behaviour).
            const int64_t diff = (int64_t)dst[row * dstride + col] - (int64_t)src[8 * row + col];
            sse += (uint64_t)(diff * diff);
        }
    }
    return sse;
}
35 | | |
/*
 * Sum of squared differences between a packed, 4-sample-wide tile and the
 * matching area of a strided picture (16-bit samples).
 *
 * src                : packed tile, 4 samples per row (row i starts at src[4 * i]).
 * dst                : top-left sample of the compared area in the picture.
 * dstride            : distance, in samples, between consecutive dst rows.
 * height             : number of rows in the tile.
 * subsampling_factor : row step; only rows 0, f, 2f, ... are visited. Must be
 *                      non-zero, otherwise the row loop never advances.
 *
 * Returns the un-normalised sum of squared errors over the visited rows.
 */
static inline uint64_t mse_4xn_16bit_c(const uint16_t* src, const uint16_t* dst, const int32_t dstride,
                                       const int32_t height, uint8_t subsampling_factor) {
    uint64_t sse = 0;
    for (int32_t row = 0; row < height; row += subsampling_factor) {
        for (int32_t col = 0; col < 4; col++) {
            // Square in 64 bits: |diff| can reach 65535 and 65535^2 does not
            // fit in int32_t (signed overflow is undefined behaviour).
            const int64_t diff = (int64_t)dst[row * dstride + col] - (int64_t)src[4 * row + col];
            sse += (uint64_t)(diff * diff);
        }
    }
    return sse;
}
48 | | |
/*
 * Sum of squared differences between a packed, 8-sample-wide tile and the
 * matching area of a strided picture (8-bit samples).
 *
 * src                : packed tile, 8 samples per row (row i starts at src[8 * i]).
 * dst                : top-left sample of the compared area in the picture.
 * dstride            : distance, in samples, between consecutive dst rows.
 * height             : number of rows in the tile.
 * subsampling_factor : row step; only rows 0, f, 2f, ... are visited. Must be
 *                      non-zero, otherwise the row loop never advances.
 *
 * Returns the un-normalised sum of squared errors over the visited rows.
 */
static inline uint64_t mse_8xn_8bit_c(const uint8_t* src, const uint8_t* dst, const int32_t dstride,
                                      const int32_t height, uint8_t subsampling_factor) {
    uint64_t sse = 0;
    for (int32_t row = 0; row < height; row += subsampling_factor) {
        const int32_t tile_base = 8 * row;
        const int32_t pic_base = row * dstride;
        for (int32_t col = 0; col < 8; col++) {
            // 8-bit inputs: |diff| <= 255, so the square always fits in int32_t.
            const int32_t diff = dst[pic_base + col] - src[tile_base + col];
            sse += diff * diff;
        }
    }
    return sse;
}
61 | | |
/*
 * Sum of squared differences between a packed, 4-sample-wide tile and the
 * matching area of a strided picture (8-bit samples).
 *
 * src                : packed tile, 4 samples per row (row i starts at src[4 * i]).
 * dst                : top-left sample of the compared area in the picture.
 * dstride            : distance, in samples, between consecutive dst rows.
 * height             : number of rows in the tile.
 * subsampling_factor : row step; only rows 0, f, 2f, ... are visited. Must be
 *                      non-zero, otherwise the row loop never advances.
 *
 * Returns the un-normalised sum of squared errors over the visited rows.
 */
static inline uint64_t mse_4xn_8bit_c(const uint8_t* src, const uint8_t* dst, const int32_t dstride,
                                      const int32_t height, uint8_t subsampling_factor) {
    uint64_t sse = 0;
    for (int32_t row = 0; row < height; row += subsampling_factor) {
        const int32_t tile_base = 4 * row;
        const int32_t pic_base = row * dstride;
        for (int32_t col = 0; col < 4; col++) {
            // 8-bit inputs: |diff| <= 255, so the square always fits in int32_t.
            const int32_t diff = dst[pic_base + col] - src[tile_base + col];
            sse += diff * diff;
        }
    }
    return sse;
}
74 | | |
75 | | /* Compute MSE only on the blocks we filtered. */ |
76 | | uint64_t svt_aom_compute_cdef_dist_16bit_c(const uint16_t* dst, int32_t dstride, const uint16_t* src, |
77 | | const CdefList* dlist, int32_t cdef_count, BlockSize bsize, |
78 | 0 | int32_t coeff_shift, uint8_t subsampling_factor) { |
79 | 0 | uint64_t sum = 0; |
80 | 0 | int32_t bi, bx, by; |
81 | 0 | if (bsize == BLOCK_8X8) { |
82 | 0 | for (bi = 0; bi < cdef_count; bi++) { |
83 | 0 | by = dlist[bi].by; |
84 | 0 | bx = dlist[bi].bx; |
85 | 0 | sum += mse_8xn_16bit_c( |
86 | 0 | &src[bi << (3 + 3)], &dst[(by << 3) * dstride + (bx << 3)], dstride, 8, subsampling_factor); |
87 | 0 | } |
88 | 0 | } else if (bsize == BLOCK_4X8) { |
89 | 0 | for (bi = 0; bi < cdef_count; bi++) { |
90 | 0 | by = dlist[bi].by; |
91 | 0 | bx = dlist[bi].bx; |
92 | 0 | sum += mse_4xn_16bit_c( |
93 | 0 | &src[bi << (3 + 2)], &dst[(by << 3) * dstride + (bx << 2)], dstride, 8, subsampling_factor); |
94 | 0 | } |
95 | 0 | } else if (bsize == BLOCK_8X4) { |
96 | 0 | for (bi = 0; bi < cdef_count; bi++) { |
97 | 0 | by = dlist[bi].by; |
98 | 0 | bx = dlist[bi].bx; |
99 | 0 | sum += mse_8xn_16bit_c( |
100 | 0 | &src[bi << (2 + 3)], &dst[(by << 2) * dstride + (bx << 3)], dstride, 4, subsampling_factor); |
101 | 0 | } |
102 | 0 | } else { |
103 | 0 | assert(bsize == BLOCK_4X4); |
104 | 0 | for (bi = 0; bi < cdef_count; bi++) { |
105 | 0 | by = dlist[bi].by; |
106 | 0 | bx = dlist[bi].bx; |
107 | 0 | sum += mse_4xn_16bit_c( |
108 | 0 | &src[bi << (2 + 2)], &dst[(by << 2) * dstride + (bx << 2)], dstride, 4, subsampling_factor); |
109 | 0 | } |
110 | 0 | } |
111 | 0 | return sum >> 2 * coeff_shift; |
112 | 0 | } |
113 | | |
114 | | uint64_t svt_aom_compute_cdef_dist_8bit_c(const uint8_t* dst8, int32_t dstride, const uint8_t* src8, |
115 | | const CdefList* dlist, int32_t cdef_count, BlockSize bsize, |
116 | 0 | int32_t coeff_shift, uint8_t subsampling_factor) { |
117 | 0 | uint64_t sum = 0; |
118 | 0 | int32_t bi, bx, by; |
119 | 0 | if (bsize == BLOCK_8X8) { |
120 | 0 | for (bi = 0; bi < cdef_count; bi++) { |
121 | 0 | by = dlist[bi].by; |
122 | 0 | bx = dlist[bi].bx; |
123 | 0 | sum += mse_8xn_8bit_c( |
124 | 0 | &src8[bi << (3 + 3)], &dst8[(by << 3) * dstride + (bx << 3)], dstride, 8, subsampling_factor); |
125 | 0 | } |
126 | 0 | } else if (bsize == BLOCK_4X8) { |
127 | 0 | for (bi = 0; bi < cdef_count; bi++) { |
128 | 0 | by = dlist[bi].by; |
129 | 0 | bx = dlist[bi].bx; |
130 | 0 | sum += mse_4xn_8bit_c( |
131 | 0 | &src8[bi << (3 + 2)], &dst8[(by << 3) * dstride + (bx << 2)], dstride, 8, subsampling_factor); |
132 | 0 | } |
133 | 0 | } else if (bsize == BLOCK_8X4) { |
134 | 0 | for (bi = 0; bi < cdef_count; bi++) { |
135 | 0 | by = dlist[bi].by; |
136 | 0 | bx = dlist[bi].bx; |
137 | 0 | sum += mse_8xn_8bit_c( |
138 | 0 | &src8[bi << (2 + 3)], &dst8[(by << 2) * dstride + (bx << 3)], dstride, 4, subsampling_factor); |
139 | 0 | } |
140 | 0 | } else { |
141 | 0 | assert(bsize == BLOCK_4X4); |
142 | 0 | for (bi = 0; bi < cdef_count; bi++) { |
143 | 0 | by = dlist[bi].by; |
144 | 0 | bx = dlist[bi].bx; |
145 | 0 | sum += mse_4xn_8bit_c( |
146 | 0 | &src8[bi << (2 + 2)], &dst8[(by << 2) * dstride + (bx << 2)], dstride, 4, subsampling_factor); |
147 | 0 | } |
148 | 0 | } |
149 | 0 | return sum >> 2 * coeff_shift; |
150 | 0 | } |
151 | | |
152 | 3.55k | static int32_t svt_sb_all_skip(PictureControlSet* pcs, const Av1Common* const cm, int32_t mi_row, int32_t mi_col) { |
153 | 3.55k | int32_t maxc, maxr; |
154 | 3.55k | maxc = cm->mi_cols - mi_col; |
155 | 3.55k | maxr = cm->mi_rows - mi_row; |
156 | | |
157 | 3.55k | maxr = AOMMIN(maxr, MI_SIZE_64X64); |
158 | 3.55k | maxc = AOMMIN(maxc, MI_SIZE_64X64); |
159 | | |
160 | 8.13k | for (int32_t r = 0; r < maxr; r++) { |
161 | 70.0k | for (int32_t c = 0; c < maxc; c++) { |
162 | 65.4k | if (!(pcs->mi_grid_base[(mi_row + r) * pcs->mi_stride + mi_col + c]->block_mi.skip)) { |
163 | 3.21k | return 0; |
164 | 3.21k | } |
165 | 65.4k | } |
166 | 7.79k | } |
167 | 336 | return 1; |
168 | 3.55k | } |
169 | | |
170 | | int32_t svt_sb_compute_cdef_list(PictureControlSet* pcs, const Av1Common* const cm, int32_t mi_row, int32_t mi_col, |
171 | 0 | CdefList* dlist, BlockSize bs) { |
172 | 0 | MbModeInfo** grid = pcs->mi_grid_base; |
173 | 0 | int32_t mi_stride = pcs->mi_stride; |
174 | |
|
175 | 0 | int32_t maxc = cm->mi_cols - mi_col; |
176 | 0 | int32_t maxr = cm->mi_rows - mi_row; |
177 | |
|
178 | 0 | if (bs == BLOCK_128X128 || bs == BLOCK_128X64) { |
179 | 0 | maxc = AOMMIN(maxc, MI_SIZE_128X128); |
180 | 0 | } else { |
181 | 0 | maxc = AOMMIN(maxc, MI_SIZE_64X64); |
182 | 0 | } |
183 | 0 | if (bs == BLOCK_128X128 || bs == BLOCK_64X128) { |
184 | 0 | maxr = AOMMIN(maxr, MI_SIZE_128X128); |
185 | 0 | } else { |
186 | 0 | maxr = AOMMIN(maxr, MI_SIZE_64X64); |
187 | 0 | } |
188 | |
|
189 | 0 | const int32_t r_step = mi_size_high[BLOCK_8X8]; |
190 | 0 | const int32_t c_step = mi_size_wide[BLOCK_8X8]; |
191 | 0 | const int32_t r_shift = (r_step == 2); |
192 | 0 | const int32_t c_shift = (c_step == 2); |
193 | |
|
194 | 0 | assert(r_step == 1 || r_step == 2); |
195 | 0 | assert(c_step == 1 || c_step == 2); |
196 | |
|
197 | 0 | int32_t count = 0; |
198 | 0 | for (int32_t r = 0; r < maxr; r += r_step) { |
199 | 0 | for (int32_t c = 0; c < maxc; c += c_step) { |
200 | 0 | if (!grid[(mi_row + r) * mi_stride + (mi_col + c)]->block_mi.skip || |
201 | 0 | !grid[(mi_row + r) * mi_stride + (mi_col + c + 1)]->block_mi.skip || |
202 | 0 | !grid[(mi_row + r + 1) * mi_stride + (mi_col + c)]->block_mi.skip || |
203 | 0 | !grid[(mi_row + r + 1) * mi_stride + (mi_col + c + 1)]->block_mi.skip) { |
204 | 0 | dlist[count].by = (uint8_t)(r >> r_shift); |
205 | 0 | dlist[count].bx = (uint8_t)(c >> c_shift); |
206 | 0 | count++; |
207 | 0 | } |
208 | 0 | } |
209 | 0 | } |
210 | 0 | return count; |
211 | 0 | } |
212 | | |
/*
 * Set a v-row by h-column rectangle of 16-bit samples to the value x.
 * dst points at the top-left sample; dstride is the row pitch in samples.
 * Nothing is written when v or h is not positive.
 */
static inline void svt_aom_fill_rect(uint16_t* dst, int32_t dstride, int32_t v, int32_t h, uint16_t x) {
    for (int32_t row = 0; row < v; row++) {
        uint16_t* line = &dst[row * dstride];
        for (int32_t col = 0; col < h; col++) {
            line[col] = x;
        }
    }
}
220 | | |
/*
 * Copy a v-row by h-column rectangle of 16-bit samples from src to dst.
 * dstride / sstride are the row pitches, in samples, of dst and src.
 * Each row is transferred with one svt_memcpy() call.
 */
static inline void svt_aom_copy_rect(uint16_t* dst, int32_t dstride, const uint16_t* src, int32_t sstride, int32_t v,
                                     int32_t h) {
    uint16_t* out = dst;
    const uint16_t* in = src;
    int32_t rows_left = v;
    while (rows_left-- > 0) {
        svt_memcpy(out, in, sizeof(out[0]) * h);
        out += dstride;
        in += sstride;
    }
}
229 | | |
/*
Loop over all 64x64 filter blocks and perform the CDEF filtering for each block, using
the filter strength pairs chosen in finish_cdef_search().

The reconstructed picture returned by svt_aom_get_recon_pic() is filtered in place.
For every filter block (FB) and plane, the samples to filter plus a border of
CDEF_HBORDER columns / CDEF_VBORDER rows are gathered into the local buffer src[]:
  - border samples that belong to an already filtered neighbour are taken from
    linebuf[] (rows saved from the FB row above) and colbuf[] (columns saved from
    the FB on the left);
  - border samples that fall outside the frame are set to CDEF_VERY_LARGE.
Then svt_cdef_filter_fb() is called with src[] as input and the recon buffer as output.
*/
void svt_av1_cdef_frame(SequenceControlSet* scs, PictureControlSet* pcs) {
    PictureParentControlSet* ppcs = pcs->ppcs;
    Av1Common* cm = ppcs->av1_cm;
    FrameHeader* frm_hdr = &ppcs->frm_hdr;
    bool is_16bit = scs->is_16bit_pipeline;

    EbPictureBufferDesc* recon_pic;
    svt_aom_get_recon_pic(pcs, &recon_pic, is_16bit);

    const int32_t num_planes = av1_num_planes(&scs->seq_header.color_config);
    DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]);
    uint16_t* linebuf[3];
    uint16_t* colbuf[3];
    CdefList dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
    uint8_t * row_cdef, *prev_row_cdef, *curr_row_cdef;
    int32_t cdef_count;
    const uint32_t sb_size = scs->super_block_size;
    int32_t mi_wide_l2[3];
    int32_t mi_high_l2[3];
    int32_t xdec[3];
    int32_t ydec[3];
    int32_t coeff_shift = AOMMAX(scs->static_config.encoder_bit_depth - 8, 0);
    // Number of filter-block rows / columns in the frame (rounded up).
    const int32_t nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
    const int32_t nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
    // Two rows of per-FB "was filtered" flags, each with one extra entry on both
    // sides so that [fbc - 1] and [fbc + 1] are always valid indices.
    const uint32_t cdef_size = sizeof(*row_cdef) * (nhfb + 2) * 2;

    row_cdef = (uint8_t*)svt_aom_malloc(cdef_size);
    assert(row_cdef != NULL);
    // All flags start at 1, including the side entries, which are never written afterwards.
    memset(row_cdef, 1, cdef_size);
    prev_row_cdef = row_cdef + 1;
    curr_row_cdef = prev_row_cdef + nhfb + 2;
    for (int32_t pli = 0; pli < num_planes; pli++) {
        // Chroma planes are treated as subsampled by 2 in both directions.
        int32_t subsampling_x = (pli == 0) ? 0 : 1;
        int32_t subsampling_y = (pli == 0) ? 0 : 1;
        xdec[pli] = subsampling_x; //CHKN xd->plane[pli].subsampling_x;
        ydec[pli] = subsampling_y; //CHKN xd->plane[pli].subsampling_y;
        mi_wide_l2[pli] = MI_SIZE_LOG2 - subsampling_x; //CHKN xd->plane[pli].subsampling_x;
        mi_high_l2[pli] = MI_SIZE_LOG2 - subsampling_y; //CHKN xd->plane[pli].subsampling_y;
    }

    // Row pitch of linebuf[], in samples: full luma width plus a border on each side.
    const int32_t stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
    // NOTE(review): linebuf / colbuf are arrays of uint16_t*, so sizeof(*linebuf) and
    // sizeof(*colbuf) are the size of a pointer, not of a uint16_t sample. The buffers
    // are only ever indexed as uint16_t below, so this looks like an over-allocation
    // (sizeof(*linebuf[pli]) was probably intended) — confirm before changing.
    // NOTE(review): unlike row_cdef, the results of these allocations are not checked.
    for (int32_t pli = 0; pli < num_planes; pli++) {
        linebuf[pli] = (uint16_t*)svt_aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride);
        colbuf[pli] = (uint16_t*)svt_aom_malloc(
            sizeof(*colbuf) * ((CDEF_BLOCKSIZE << mi_high_l2[pli]) + 2 * CDEF_VBORDER) * CDEF_HBORDER);
    }
#if OPT_CDEF_SKIP_CHROMA_BORDER
    // Frame-level check: if every UV strength entry is 0, no chroma block
    // will ever be filtered. In that case skip all chroma border copies
    // (including linebuf/colbuf saves) for the entire frame
    bool chroma_filter_off = (num_planes > 1);
    if (chroma_filter_off) {
        for (int32_t i = 0; i < ppcs->nb_cdef_strengths; i++) {
            if (frm_hdr->cdef_params.cdef_uv_strength[i] != 0) {
                chroma_filter_off = false;
                break;
            }
        }
    }
    const int32_t active_planes = chroma_filter_off ? 1 : num_planes;
#endif
    for (int32_t fbr = 0; fbr < nvfb; fbr++) {
        // cdef_left == 1: the FB on the left was processed, so its right-edge columns
        // must come from colbuf[] instead of the (already modified) recon buffer.
        int32_t cdef_left = 1;
        for (int32_t fbc = 0; fbc < nhfb; fbc++) {
            int32_t level, sec_strength;
            int32_t uv_level, uv_sec_strength;
            int32_t nhb, nvb;
            int32_t cstart = 0;
            curr_row_cdef[fbc] = 0;
            assert(pcs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc] != NULL &&
                   "CDEF ERROR: Skipping Current FB");
            assert(pcs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc]->cdef_strength != -1 &&
                   "CDEF ERROR: Skipping Current FB");
            if (!cdef_left) {
                cstart =
                    -CDEF_HBORDER; //CHKN if the left block has not been filtered, then we can use samples on the left as input.
            }

            // Size of this FB in mode-info units, clipped at the right / bottom frame edge.
            nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
            nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
            int32_t frame_top, frame_left, frame_bottom, frame_right;

            int32_t mi_row = MI_SIZE_64X64 * fbr;
            int32_t mi_col = MI_SIZE_64X64 * fbc;
            // for the current filter block, its top-left corner mi structure (mi_tl)
            // is first accessed to check whether the top and left boundaries are
            // frame boundaries. Then bottom-left and top-right mi structures are
            // accessed to check whether the bottom and right boundaries
            // (respectively) are frame boundaries.
            //
            // Note that we can't just check the bottom-right mi structure - eg. if
            // we're at the right-hand edge of the frame but not the bottom, then
            // the bottom-right mi is NULL but the bottom-left is not.
            frame_top = (mi_row == 0) ? 1 : 0;
            frame_left = (mi_col == 0) ? 1 : 0;

            if (fbr != nvfb - 1) {
                frame_bottom = (mi_row + MI_SIZE_64X64 == cm->mi_rows) ? 1 : 0;
            } else {
                frame_bottom = 1;
            }

            if (fbc != nhfb - 1) {
                frame_right = (mi_col + MI_SIZE_64X64 == cm->mi_cols) ? 1 : 0;
            } else {
                frame_right = 1;
            }

            // Find the index of the CDEF strength for the filter block
            const int32_t mbmi_cdef_strength =
                pcs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc]->cdef_strength;
            level = frm_hdr->cdef_params.cdef_y_strength[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
            sec_strength = frm_hdr->cdef_params.cdef_y_strength[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
            // Secondary luma strength takes values in {0, 1, 2, 4}. If sec_strength is equal to 3 from the step above, change it to 4.
            sec_strength += sec_strength == 3;
            // Set primary and secondary chroma strengths.
            uv_level = frm_hdr->cdef_params.cdef_uv_strength[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
            uv_sec_strength = frm_hdr->cdef_params.cdef_uv_strength[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
            // Secondary chroma strength takes values in {0, 1, 2, 4}. If sec_strength is equal to 3 from the step above, change it to 4.
            uv_sec_strength += uv_sec_strength == 3;
            // Nothing to do for this FB when all strengths are zero or when it has no
            // non-skipped 8x8 block. cdef_count is only assigned when the first test fails.
            if ((level == 0 && sec_strength == 0 && uv_level == 0 && uv_sec_strength == 0) ||
                (cdef_count = svt_sb_compute_cdef_list(
                     pcs, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, BLOCK_64X64)) == 0) {
                cdef_left = 0;
                continue;
            }

            int dirinit = !(ppcs->cdef_search_ctrls.use_reference_cdef_fs || ppcs->cdef_search_ctrls.use_qp_strength);
            // When SB 128 is used, the search for certain blocks is skipped, so dir/var info is not generated
            // In those cases, must generate info here
            if (sb_size == 128) {
                const uint32_t lc = MI_SIZE_64X64 * fbc;
                const uint32_t lr = MI_SIZE_64X64 * fbr;
                const MbModeInfo* mbmi = pcs->mi_grid_base[lr * cm->mi_stride + lc];
                const BlockSize bsize = mbmi->bsize;
                if (((fbc & 1) && (bsize == BLOCK_128X128 || bsize == BLOCK_128X64)) ||
                    ((fbr & 1) && (bsize == BLOCK_128X128 || bsize == BLOCK_64X128))) {
                    dirinit = 0;
                }
            }
            uint8_t (*dir)[CDEF_NBLOCKS][CDEF_NBLOCKS] = &pcs->cdef_dir_data[fbr * nhfb + fbc].dir;
            int32_t (*var)[CDEF_NBLOCKS][CDEF_NBLOCKS] = &pcs->cdef_dir_data[fbr * nhfb + fbc].var;
            curr_row_cdef[fbc] = 1;
#if OPT_CDEF_SKIP_CHROMA_BORDER
            for (int32_t pli = 0; pli < active_planes; pli++) {
#else
            for (int32_t pli = 0; pli < num_planes; pli++) {
#endif
                int32_t coffset;
                int32_t rend, cend;
                int32_t pri_damping = frm_hdr->cdef_params.cdef_damping;
                int32_t sec_damping = pri_damping;
                // FB size in samples for this plane.
                int32_t hsize = nhb << mi_wide_l2[pli];
                int32_t vsize = nvb << mi_high_l2[pli];
                // cend / rend: last column / row to copy into src[]; includes the
                // right / bottom border unless this is the last FB column / row.
                if (fbc == nhfb - 1) {
                    cend = hsize;
                } else {
                    cend = hsize + CDEF_HBORDER;
                }

                if (fbr == nvfb - 1) {
                    rend = vsize;
                } else {
                    rend = vsize + CDEF_VBORDER;
                }

                // Column of the FB's first sample in this plane.
                coffset = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
                EbByte rec_buff = recon_pic->buffer[pli];
                uint32_t rec_stride = recon_pic->stride[pli];
                if (pli) {
                    level = uv_level;
                    sec_strength = uv_sec_strength;
                }
#if OPT_CDEF_PER_PLANE_SKIP
                // Per-plane elision: when this plane's strength is (0,0), the
                // filter is a no-op and the only purpose of the per-plane src[]
                // border-copy machinery is to feed colbuf[pli] / linebuf[pli]
                // for neighbours (right, below). Since the recon for this plane
                // is unmodified, we can source those saves directly from
                // rec_buff and skip the whole src[] dance.
                //
                // For luma (pli=0) we additionally require dirinit==1 so that
                // dir/var are already populated by the search; if dirinit==0
                // (use_reference_cdef_fs / use_qp_strength), svt_cdef_filter_fb
                // must still run to populate dir for subsequent chroma planes.
                //
                // NOTE(review): the condition below also requires fbc == nhfb - 1, which
                // makes the inner "fbc < nhfb - 1" colbuf save unreachable and restricts
                // the elision to the last FB column. Either the extra term or the inner
                // block looks unintended — confirm against the change that introduced it.
                if (level == 0 && sec_strength == 0 && (pli != 0 || dirinit) && fbc == nhfb - 1) {
                    // Save linebuf[pli] (bottom edge for the FB below) from rec_buff.
                    if (fbr < nvfb - 1) {
                        svt_aom_copy_sb8_16(&linebuf[pli][coffset],
                                            stride,
                                            rec_buff,
                                            (MI_SIZE_64X64 << mi_high_l2[pli]) * (fbr + 1) - CDEF_VBORDER,
                                            coffset,
                                            rec_stride,
                                            CDEF_VBORDER,
                                            hsize,
                                            is_16bit);
                    }
                    // Save colbuf[pli] (right edge for the FB to the right) from rec_buff.
                    // colbuf layout: rend+VBORDER rows x HBORDER cols, mirroring the
                    // standard save which reads from src[] rows 0..rend+VBORDER-1 cols
                    // hsize..hsize+HBORDER-1 (== rec_buff cols coffset+hsize-HBORDER..coffset+hsize-1).
                    //
                    // - For fbr > 0: copy the full rend+VBORDER rows starting VBORDER above the FB.
                    // - For fbr == 0: the top VBORDER rows of colbuf are read by the right
                    //   neighbour but immediately overwritten by its frame_top fill, so we
                    //   skip them (rec_buff has no rows above 0).
                    if (fbc < nhfb - 1) {
                        const int32_t row_top = (fbr == 0) ? 0 : -CDEF_VBORDER;
                        const int32_t num_rows = (fbr == 0) ? rend : (rend + CDEF_VBORDER);
                        const int32_t dst_row_off = (fbr == 0) ? CDEF_VBORDER : 0;
                        svt_aom_copy_sb8_16(colbuf[pli] + dst_row_off * CDEF_HBORDER,
                                            CDEF_HBORDER,
                                            rec_buff,
                                            (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr + row_top,
                                            coffset + hsize - CDEF_HBORDER,
                                            rec_stride,
                                            num_rows,
                                            CDEF_HBORDER,
                                            is_16bit);
                    }
                    continue;
                }
#endif

                /* Copy in the pixels we need from the current superblock for
                deringing.*/
                svt_aom_copy_sb8_16(&src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
                                    CDEF_BSTRIDE,
                                    rec_buff,
                                    (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr,
                                    coffset + cstart,
                                    rec_stride,
                                    rend,
                                    cend - cstart,
                                    is_16bit);
                // Top border, centre part: from the recon buffer when the FB above was
                // not filtered, from linebuf[] when it was, CDEF_VERY_LARGE on the first FB row.
                if (!prev_row_cdef[fbc]) {
                    svt_aom_copy_sb8_16(&src[CDEF_HBORDER],
                                        CDEF_BSTRIDE,
                                        rec_buff,
                                        (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
                                        coffset,
                                        rec_stride,
                                        CDEF_VBORDER,
                                        hsize,
                                        is_16bit);
                } else if (fbr > 0) {
                    svt_aom_copy_rect(
                        &src[CDEF_HBORDER], CDEF_BSTRIDE, &linebuf[pli][coffset], stride, CDEF_VBORDER, hsize);
                } else {
                    svt_aom_fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize, CDEF_VERY_LARGE);
                }

                // Top-left corner of the border, same three-way choice using the above-left FB.
                if (!prev_row_cdef[fbc - 1]) {
                    svt_aom_copy_sb8_16(src,
                                        CDEF_BSTRIDE,
                                        rec_buff,
                                        (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
                                        coffset - CDEF_HBORDER,
                                        rec_stride,
                                        CDEF_VBORDER,
                                        CDEF_HBORDER,
                                        is_16bit);
                } else if (fbr > 0 && fbc > 0) {
                    svt_aom_copy_rect(
                        src, CDEF_BSTRIDE, &linebuf[pli][coffset - CDEF_HBORDER], stride, CDEF_VBORDER, CDEF_HBORDER);
                } else {
                    svt_aom_fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
                }

                // Top-right corner of the border, same three-way choice using the above-right FB.
                if (!prev_row_cdef[fbc + 1]) {
                    svt_aom_copy_sb8_16(&src[CDEF_HBORDER + (nhb << mi_wide_l2[pli])],
                                        CDEF_BSTRIDE,
                                        rec_buff,
                                        (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
                                        coffset + hsize,
                                        rec_stride,
                                        CDEF_VBORDER,
                                        CDEF_HBORDER,
                                        is_16bit);
                } else if (fbr > 0 && fbc < nhfb - 1) {
                    svt_aom_copy_rect(&src[hsize + CDEF_HBORDER],
                                      CDEF_BSTRIDE,
                                      &linebuf[pli][coffset + hsize],
                                      stride,
                                      CDEF_VBORDER,
                                      CDEF_HBORDER);
                } else {
                    svt_aom_fill_rect(
                        &src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
                }

                if (cdef_left) {
                    /* If we deringed the superblock on the left then we need to copy in
                    saved pixels. */
                    svt_aom_copy_rect(src, CDEF_BSTRIDE, colbuf[pli], CDEF_HBORDER, rend + CDEF_VBORDER, CDEF_HBORDER);
                }

                /* Saving pixels in case we need to dering the superblock on the
                right. */
                if (fbc < nhfb - 1) {
                    svt_aom_copy_rect(
                        colbuf[pli], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE, rend + CDEF_VBORDER, CDEF_HBORDER);
                }

                // Save this FB's last CDEF_VBORDER rows (read from the recon buffer before
                // the filter call below) for the FB row underneath.
                if (fbr < nvfb - 1) {
                    svt_aom_copy_sb8_16(&linebuf[pli][coffset],
                                        stride,
                                        rec_buff,
                                        (MI_SIZE_64X64 << mi_high_l2[pli]) * (fbr + 1) - CDEF_VBORDER,
                                        coffset,
                                        rec_stride,
                                        CDEF_VBORDER,
                                        hsize,
                                        is_16bit);
                }

                // Borders that lie outside the frame are overwritten with CDEF_VERY_LARGE.
                if (frame_top) {
                    svt_aom_fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
                }
                if (frame_left) {
                    svt_aom_fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
                }
                if (frame_bottom) {
                    svt_aom_fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE],
                                      CDEF_BSTRIDE,
                                      CDEF_VBORDER,
                                      hsize + 2 * CDEF_HBORDER,
                                      CDEF_VERY_LARGE);
                }
                if (frame_right) {
                    svt_aom_fill_rect(&src[hsize + CDEF_HBORDER],
                                      CDEF_BSTRIDE,
                                      vsize + 2 * CDEF_VBORDER,
                                      CDEF_HBORDER,
                                      CDEF_VERY_LARGE);
                }
                // if ppcs->cdef_search_ctrls.use_reference_cdef_fs is true, then search was not performed
                // Therefore, need to make sure dir and var are initialized
                if (level || sec_strength || !dirinit) {
                    // Exactly one of the two destination pointers is non-NULL,
                    // selected by is_16bit.
                    svt_cdef_filter_fb(
                        is_16bit ? NULL
                                 : &rec_buff[rec_stride * (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
                                             (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
                        is_16bit ? &((uint16_t*)rec_buff)[rec_stride * (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
                                                          (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])]
                                 : NULL,
                        rec_stride,
                        &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
                        xdec[pli],
                        ydec[pli],
                        *dir,
                        &dirinit,
                        *var,
                        pli,
                        dlist,
                        cdef_count,
                        level,
                        sec_strength,
                        pri_damping,
                        sec_damping,
                        coeff_shift,
                        1); // no subsampling
                }
            }
            cdef_left = 1; //CHKN filtered data is written back directly to recFrame.
        }
        {
            // The flags of the row just processed become the "previous row" flags.
            uint8_t* tmp = prev_row_cdef;
            prev_row_cdef = curr_row_cdef;
            curr_row_cdef = tmp;
        }
    }
    svt_aom_free(row_cdef);
    for (int32_t pli = 0; pli < num_planes; pli++) {
        svt_aom_free(linebuf[pli]);
        svt_aom_free(colbuf[pli]);
    }
}
613 | | |
614 | | ///-------search |
615 | | /* |
616 | | * Search for the best luma+chroma strength to add as an option, knowing we |
617 | | * already selected nb_strengths options |
618 | | * |
619 | | * Params: |
620 | | * |
621 | | * lev0 : Array of indices of selected luma strengths. |
622 | | * lev1 : Array of indices of selected chroma strengths. |
623 | | * nb_strengths : Number of selected (Luma_strength, Chroma_strength) pairs. |
624 | | * mse : Array of luma and chroma filtering mse values. |
625 | | * sb_count : Number of filter blocks in the frame. |
626 | | * start_gi : starting strength index for the search of the additional strengths. |
627 | | * end_gi : End index for the for the search of the additional strengths. |
628 | | */ |
629 | | uint64_t svt_search_one_dual_c(int* lev0, int* lev1, int nb_strengths, uint64_t** mse[2], int sb_count, int start_gi, |
630 | 0 | int end_gi) { |
631 | 0 | uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS]; |
632 | 0 | int32_t i, j; |
633 | 0 | uint64_t best_tot_mse = (uint64_t)1 << 63; |
634 | 0 | int32_t best_id0 = 0; |
635 | 0 | int32_t best_id1 = 0; |
636 | 0 | const int32_t total_strengths = end_gi; |
637 | 0 | memset(tot_mse, 0, sizeof(tot_mse)); |
638 | | /* Loop over the filter blocks in the frame */ |
639 | 0 | for (i = 0; i < sb_count; i++) { |
640 | 0 | int32_t gi; |
641 | 0 | uint64_t best_mse = (uint64_t)1 << 63; |
642 | | /* Loop over the already selected nb_strengths (Luma_strength, |
643 | | Chroma_strength) pairs, and find the pair that has the smallest mse |
644 | | (best_mse) for the current filter block.*/ |
645 | | /* Find best mse among already selected options. */ |
646 | 0 | for (gi = 0; gi < nb_strengths; gi++) { |
647 | 0 | uint64_t curr = mse[0][i][lev0[gi]]; |
648 | 0 | curr += mse[1][i][lev1[gi]]; |
649 | 0 | if (curr < best_mse) { |
650 | 0 | best_mse = curr; |
651 | 0 | } |
652 | 0 | } |
653 | | /* Loop over the set of available (Luma_strength, Chroma_strength) |
654 | | pairs, identify any that provide an mse better than best_mse from the |
655 | | step above for the current filter block, and update any corresponding |
656 | | total mse (tot_mse[j][k]). */ |
657 | | /* Find best mse when adding each possible new option. */ |
658 | 0 | for (j = start_gi; j < total_strengths; j++) { |
659 | 0 | int32_t k; |
660 | 0 | for (k = start_gi; k < total_strengths; k++) { |
661 | 0 | uint64_t best = best_mse; |
662 | 0 | uint64_t curr = mse[0][i][j]; |
663 | 0 | curr += mse[1][i][k]; |
664 | 0 | if (curr < best) { |
665 | 0 | best = curr; |
666 | 0 | } |
667 | 0 | tot_mse[j][k] += best; |
668 | 0 | } |
669 | 0 | } |
670 | 0 | } |
671 | | /* Loop over the additionally searched (Luma_strength, Chroma_strength) pairs |
672 | | from the step above, and identify any such pair that provided the best mse for |
673 | | the whole frame. The identified pair would be added to the set of already selected pairs. */ |
674 | 0 | for (j = start_gi; j < total_strengths; j++) { // Loop over the additionally searched luma strengths |
675 | 0 | int32_t k; |
676 | 0 | for (k = start_gi; k < total_strengths; k++) { // Loop over the additionally searched chroma strengths |
677 | 0 | if (tot_mse[j][k] < best_tot_mse) { |
678 | 0 | best_tot_mse = tot_mse[j][k]; |
679 | 0 | best_id0 = j; // index for the best luma strength |
680 | 0 | best_id1 = k; // index for the best chroma strength |
681 | 0 | } |
682 | 0 | } |
683 | 0 | } |
684 | 0 | lev0[nb_strengths] = best_id0; // Add the identified luma strength to the list of selected luma strengths |
685 | 0 | lev1[nb_strengths] = best_id1; // Add the identified chroma strength to the list of selected chroma strengths |
686 | 0 | return best_tot_mse; |
687 | 0 | } |
688 | | |
/*
 * Select nb_strengths (luma, chroma) strength pairs that minimise the total
 * filtering mse over the frame.
 *
 * Params:
 *
 * best_lev0 : Output, indices of the selected luma strengths.
 * best_lev1 : Output, indices of the selected chroma strengths.
 * nb_strengths : Number of pairs to select.
 * mse : Array of luma and chroma filtering mse values.
 * sb_count : Number of filter blocks in the frame.
 * start_gi : First strength index considered by the search.
 * end_gi : End strength index considered by the search.
 *
 * Returns the total mse reported by the last svt_search_one_dual() call, or
 * (uint64_t)1 << 63 when nb_strengths is not positive.
 */
static uint64_t joint_strength_search_dual(int32_t* best_lev0, int32_t* best_lev1, int32_t nb_strengths,
                                           uint64_t** mse[2], int32_t sb_count, int32_t start_gi, int32_t end_gi) {
    uint64_t frame_mse = (uint64_t)1 << 63;

    /* Greedy pass: grow the list one pair at a time; each call takes the pairs
       already placed in slots [0, filled) into account and writes slot [filled]. */
    for (int32_t filled = 0; filled < nb_strengths; filled++) {
        frame_mse = svt_search_one_dual(best_lev0, best_lev1, filled, mse, sb_count, start_gi, end_gi);
    }

    /* Refinement: 4 * nb_strengths times, drop the oldest pair (slot 0), shift the
       others down by one and search again for the pair to put in the last slot. */
    const int32_t last_slot = nb_strengths - 1;
    const int32_t refinements = 4 * nb_strengths;
    for (int32_t pass = 0; pass < refinements; pass++) {
        for (int32_t slot = 1; slot <= last_slot; slot++) {
            best_lev0[slot - 1] = best_lev0[slot];
            best_lev1[slot - 1] = best_lev1[slot];
        }
        frame_mse = svt_search_one_dual(best_lev0, best_lev1, last_slot, mse, sb_count, start_gi, end_gi);
    }
    return frame_mse;
}
734 | | |
735 | | // This kernel is ported/adapted from libaom (AV1 reference implementation). |
736 | | // Original logic inspired by aom_pick_cdef_from_qp(). |
737 | | // Adjusted to match SVT-AV1 data structures and pipeline integration. |
738 | | static void svt_pick_cdef_from_qp(PictureParentControlSet* ppcs, int32_t is_screen_content, int32_t* pred_y_strength, |
739 | 257 | int32_t* pred_uv_strength) { |
740 | 257 | FrameHeader* frm_hdr = &ppcs->frm_hdr; |
741 | 257 | const uint8_t bit_depth = ppcs->enhanced_pic->bit_depth; |
742 | 257 | const int32_t base_q_idx = frm_hdr->quantization_params.base_q_idx; |
743 | | |
744 | 257 | int32_t q = svt_aom_ac_quant_qtx(base_q_idx, 0, bit_depth); |
745 | 257 | q >>= (bit_depth - 8); |
746 | | |
747 | 257 | int32_t y_f1 = 0, y_f2 = 0; |
748 | 257 | int32_t uv_f1 = 0, uv_f2 = 0; |
749 | | |
750 | 257 | const int32_t is_intra = (frm_hdr->frame_type == KEY_FRAME || frm_hdr->frame_type == INTRA_ONLY_FRAME); |
751 | | |
752 | 257 | if (is_screen_content) { |
753 | 0 | y_f1 = (int32_t)(5.88217781e-06 * q * q + 6.10391455e-03 * q + 9.95043102e-02); |
754 | |
|
755 | 0 | y_f2 = (int32_t)(-7.79934857e-06 * q * q + 6.58957830e-03 * q + 8.81045025e-01); |
756 | |
|
757 | 0 | uv_f1 = (int32_t)(-6.79500136e-06 * q * q + 1.02695586e-02 * q + 1.36126802e-01); |
758 | |
|
759 | 0 | uv_f2 = (int32_t)(-9.99613695e-08 * q * q - 1.79361339e-05 * q + 1.17022324e+0); |
760 | 257 | } else if (!is_intra) { |
761 | 0 | y_f1 = (int32_t)roundf(q * q * -0.0000023593946f + q * 0.0068615186f + 0.02709886f); |
762 | |
|
763 | 0 | y_f2 = (int32_t)roundf(q * q * -0.00000057629734f + q * 0.0013993345f + 0.03831067f); |
764 | |
|
765 | 0 | uv_f1 = (int32_t)roundf(q * q * -0.0000007095069f + q * 0.0034628846f + 0.00887099f); |
766 | |
|
767 | 0 | uv_f2 = (int32_t)roundf(q * q * 0.00000023874085f + q * 0.00028223585f + 0.05576307f); |
768 | 257 | } else { // Intra |
769 | 257 | y_f1 = (int32_t)roundf(q * q * 0.0000033731974f + q * 0.008070594f + 0.0187634f); |
770 | | |
771 | 257 | y_f2 = (int32_t)roundf(q * q * 0.0000029167343f + q * 0.0027798624f + 0.0079405f); |
772 | | |
773 | 257 | uv_f1 = (int32_t)roundf(q * q * -0.0000130790995f + q * 0.012892405f - 0.00748388f); |
774 | | |
775 | 257 | uv_f2 = (int32_t)roundf(q * q * 0.0000032651783f + q * 0.00035520183f + 0.00228092f); |
776 | 257 | } |
777 | | |
778 | | // Clamp to AV1 limits |
779 | 257 | y_f1 = clamp(y_f1, 0, 15); |
780 | 257 | y_f2 = clamp(y_f2, 0, 3); |
781 | 257 | uv_f1 = clamp(uv_f1, 0, 15); |
782 | 257 | uv_f2 = clamp(uv_f2, 0, 3); |
783 | | |
784 | | // Pack primary + secondary |
785 | 257 | *pred_y_strength = y_f1 * CDEF_SEC_STRENGTHS + y_f2; |
786 | 257 | *pred_uv_strength = uv_f1 * CDEF_SEC_STRENGTHS + uv_f2; |
787 | 257 | } |
788 | | |
789 | | #if CLN_FINISH_CDEF |
790 | | |
791 | | // Propagate cdef_strength to all 64x64 mi |
792 | 3.21k | static INLINE void propagate_cdef_strength(PictureControlSet* pcs, int32_t sb_index, int8_t strength) { |
793 | 3.21k | MbModeInfo* mbmi = pcs->mi_grid_base[sb_index]; |
794 | 3.21k | mbmi->cdef_strength = strength; |
795 | 3.21k | switch (mbmi->bsize) { |
796 | 0 | case BLOCK_128X128: |
797 | 0 | pcs->mi_grid_base[sb_index + MI_SIZE_64X64]->cdef_strength = strength; |
798 | 0 | pcs->mi_grid_base[sb_index + MI_SIZE_64X64 * pcs->mi_stride]->cdef_strength = strength; |
799 | 0 | pcs->mi_grid_base[sb_index + MI_SIZE_64X64 * pcs->mi_stride + MI_SIZE_64X64]->cdef_strength = strength; |
800 | 0 | break; |
801 | 0 | case BLOCK_128X64: |
802 | 0 | pcs->mi_grid_base[sb_index + MI_SIZE_64X64]->cdef_strength = strength; |
803 | 0 | break; |
804 | 0 | case BLOCK_64X128: |
805 | 0 | pcs->mi_grid_base[sb_index + MI_SIZE_64X64 * pcs->mi_stride]->cdef_strength = strength; |
806 | 0 | break; |
807 | 3.21k | default: |
808 | 3.21k | break; |
809 | 3.21k | } |
810 | 3.21k | } |
811 | | |
812 | 257 | #define CDEF_DAMPING_FROM_QP(base_q_idx) (3 + ((base_q_idx) >> 6)) |
813 | | |
814 | 257 | void finish_cdef_search(PictureControlSet* pcs) { |
815 | 257 | PictureParentControlSet* ppcs = pcs->ppcs; |
816 | 257 | FrameHeader* frm_hdr = &ppcs->frm_hdr; |
817 | 257 | Av1Common* cm = ppcs->av1_cm; |
818 | 257 | int32_t mi_rows = ppcs->av1_cm->mi_rows; |
819 | 257 | int32_t mi_cols = ppcs->av1_cm->mi_cols; |
820 | | |
821 | 257 | int32_t fbr, fbc; |
822 | 257 | uint64_t best_tot_mse = (uint64_t)1 << 63; |
823 | 257 | int32_t sb_count; |
824 | 257 | int32_t nvfb = (mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; |
825 | 257 | int32_t nhfb = (mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; |
826 | | |
827 | 257 | CdefSearchControls* cdef_search_ctrls = &pcs->ppcs->cdef_search_ctrls; |
828 | | |
829 | 257 | if (cdef_search_ctrls->use_qp_strength) { |
830 | 257 | #if OPT_SC_CDEF_QP |
831 | 257 | const bool allintra = ppcs->scs->allintra; |
832 | 257 | const uint8_t sc_class1 = ppcs->sc_class1; |
833 | 257 | const uint8_t sc_class5 = ppcs->sc_class5; |
834 | 257 | const uint8_t sc = allintra ? sc_class5 : sc_class1; |
835 | 257 | int pred_y, pred_uv; |
836 | 257 | svt_pick_cdef_from_qp(ppcs, sc, &pred_y, &pred_uv); |
837 | | #else |
838 | | int pred_y, pred_uv; |
839 | | svt_pick_cdef_from_qp(ppcs, 0, &pred_y, &pred_uv); |
840 | | #endif |
841 | 257 | frm_hdr->cdef_params.cdef_bits = 0; |
842 | 257 | ppcs->nb_cdef_strengths = 1; |
843 | 257 | frm_hdr->cdef_params.cdef_y_strength[0] = pred_y; |
844 | 257 | frm_hdr->cdef_params.cdef_uv_strength[0] = pred_uv; |
845 | 257 | frm_hdr->cdef_params.cdef_damping = CDEF_DAMPING_FROM_QP(frm_hdr->quantization_params.base_q_idx); |
846 | | |
847 | 1.16k | for (fbr = 0; fbr < nvfb; ++fbr) { |
848 | 4.45k | for (fbc = 0; fbc < nhfb; ++fbc) { |
849 | 3.55k | const int32_t sb_idx = MI_SIZE_64X64 * fbr * pcs->mi_stride + MI_SIZE_64X64 * fbc; |
850 | 3.55k | const MbModeInfo* mbmi = pcs->mi_grid_base[sb_idx]; |
851 | | |
852 | 3.55k | if (((fbc & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) || |
853 | 3.55k | ((fbr & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128))) { |
854 | 0 | continue; |
855 | 0 | } |
856 | 3.55k | if (svt_sb_all_skip(pcs, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) { |
857 | 336 | continue; |
858 | 336 | } |
859 | | |
860 | 3.21k | propagate_cdef_strength(pcs, sb_idx, 0); |
861 | 3.21k | } |
862 | 905 | } |
863 | 257 | return; |
864 | 257 | } |
865 | | |
866 | 0 | CdefReconControls* cdef_recon_ctrls = &pcs->ppcs->cdef_recon_ctrls; |
867 | 0 | const int first_pass_fs_num = cdef_search_ctrls->first_pass_fs_num; |
868 | 0 | const int default_second_pass_fs_num = cdef_search_ctrls->default_second_pass_fs_num; |
869 | |
|
870 | 0 | frm_hdr->cdef_params.cdef_bits = 0; |
871 | 0 | ppcs->nb_cdef_strengths = 1; |
872 | 0 | frm_hdr->cdef_params.cdef_y_strength[0] = cdef_search_ctrls->pred_y_f; |
873 | 0 | frm_hdr->cdef_params.cdef_uv_strength[0] = cdef_search_ctrls->pred_uv_f; |
874 | 0 | frm_hdr->cdef_params.cdef_damping = CDEF_DAMPING_FROM_QP(frm_hdr->quantization_params.base_q_idx); |
875 | |
|
876 | 0 | if (cdef_search_ctrls->use_reference_cdef_fs) { |
877 | 0 | for (fbr = 0; fbr < nvfb; ++fbr) { |
878 | 0 | for (fbc = 0; fbc < nhfb; ++fbc) { |
879 | 0 | const int32_t sb_idx = MI_SIZE_64X64 * fbr * pcs->mi_stride + MI_SIZE_64X64 * fbc; |
880 | 0 | const MbModeInfo* mbmi = pcs->mi_grid_base[sb_idx]; |
881 | |
|
882 | 0 | if (((fbc & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) || |
883 | 0 | ((fbr & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128))) { |
884 | 0 | continue; |
885 | 0 | } |
886 | 0 | if (svt_sb_all_skip(pcs, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) { |
887 | 0 | continue; |
888 | 0 | } |
889 | | |
890 | 0 | propagate_cdef_strength(pcs, sb_idx, 0); |
891 | 0 | } |
892 | 0 | } |
893 | 0 | return; |
894 | 0 | } |
895 | | |
896 | 0 | int32_t* sb_index; |
897 | 0 | EB_MALLOC_ARRAY_NO_CHECK(sb_index, nvfb * nhfb); |
898 | 0 | assert(sb_index != NULL); |
899 | |
|
900 | 0 | uint64_t** mse[2]; |
901 | 0 | EB_MALLOC_ARRAY_NO_CHECK(mse[0], nvfb * nhfb); |
902 | 0 | EB_MALLOC_ARRAY_NO_CHECK(mse[1], nvfb * nhfb); |
903 | 0 | assert(mse[0] != NULL); |
904 | 0 | assert(mse[1] != NULL); |
905 | |
|
906 | 0 | const int32_t start_gi = 0; |
907 | 0 | const int32_t end_gi = first_pass_fs_num + default_second_pass_fs_num; |
908 | 0 | int32_t i; |
909 | 0 | int32_t nb_strengths; |
910 | 0 | int32_t nb_strength_bits = 0; |
911 | 0 | uint64_t lambda; |
912 | 0 | uint32_t fast_lambda, full_lambda = 0; |
913 | |
|
914 | 0 | svt_aom_lambda_assign(pcs, |
915 | 0 | &fast_lambda, |
916 | 0 | &full_lambda, |
917 | 0 | pcs->ppcs->enhanced_pic->bit_depth, |
918 | 0 | pcs->ppcs->frm_hdr.quantization_params.base_q_idx, |
919 | 0 | false); |
920 | 0 | lambda = full_lambda; |
921 | 0 | sb_count = 0; |
922 | |
|
923 | 0 | for (fbr = 0; fbr < nvfb; ++fbr) { |
924 | 0 | for (fbc = 0; fbc < nhfb; ++fbc) { |
925 | 0 | const MbModeInfo* mbmi = pcs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc]; |
926 | 0 | if (((fbc & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) || |
927 | 0 | ((fbr & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128))) { |
928 | 0 | continue; |
929 | 0 | } |
930 | 0 | if (pcs->skip_cdef_seg[fbr * nhfb + fbc]) { |
931 | 0 | continue; |
932 | 0 | } |
933 | | |
934 | 0 | mse[0][sb_count] = pcs->mse_seg[0][fbr * nhfb + fbc]; |
935 | 0 | mse[1][sb_count] = pcs->mse_seg[1][fbr * nhfb + fbc]; |
936 | 0 | sb_index[sb_count] = MI_SIZE_64X64 * fbr * pcs->mi_stride + MI_SIZE_64X64 * fbc; |
937 | 0 | sb_count++; |
938 | 0 | } |
939 | 0 | } |
940 | | |
941 | | // Scale down the cost of the (0,0) filter strength to bias selection towards off. When off, we can save the cost of the application |
942 | 0 | if (cdef_recon_ctrls->zero_fs_cost_bias) { |
943 | 0 | const bool is_16bit = (pcs->scs->static_config.encoder_bit_depth > EB_EIGHT_BIT); |
944 | 0 | uint16_t factor; |
945 | 0 | for (i = 0; i < sb_count; i++) { |
946 | 0 | if (is_16bit) { |
947 | 0 | factor = cdef_recon_ctrls->zero_fs_cost_bias; |
948 | 0 | if (mse[0][i][0] < 5000) { |
949 | 0 | factor = MIN(factor - 10, 64); |
950 | 0 | } else if (mse[0][i][0] < 10000) { |
951 | 0 | factor = MIN(factor - 5, 64); |
952 | 0 | } else if (mse[0][i][0] > 25000) { |
953 | 0 | factor = MIN(factor + 1, 64); |
954 | 0 | } |
955 | 0 | mse[0][i][0] = (factor * mse[0][i][0]) >> 6; |
956 | |
|
957 | 0 | factor = cdef_recon_ctrls->zero_fs_cost_bias; |
958 | 0 | if (mse[1][i][0] < 5000) { |
959 | 0 | factor = MIN(factor - 10, 64); |
960 | 0 | } else if (mse[1][i][0] < 10000) { |
961 | 0 | factor = MIN(factor - 5, 64); |
962 | 0 | } else if (mse[1][i][0] > 25000) { |
963 | 0 | factor = MIN(factor + 1, 64); |
964 | 0 | } |
965 | 0 | mse[1][i][0] = (factor * mse[1][i][0]) >> 6; |
966 | 0 | } else { |
967 | 0 | factor = cdef_recon_ctrls->zero_fs_cost_bias; |
968 | 0 | if (mse[0][i][0] > 25000) { |
969 | 0 | factor = MIN(factor + 2, 64); |
970 | 0 | } else if (mse[0][i][0] > 10000) { |
971 | 0 | factor = MIN(factor + 1, 64); |
972 | 0 | } |
973 | 0 | mse[0][i][0] = (factor * mse[0][i][0]) >> 6; |
974 | |
|
975 | 0 | factor = cdef_recon_ctrls->zero_fs_cost_bias; |
976 | 0 | if (mse[1][i][0] > 25000) { |
977 | 0 | factor = MIN(factor + 2, 64); |
978 | 0 | } else if (mse[1][i][0] > 10000) { |
979 | 0 | factor = MIN(factor + 1, 64); |
980 | 0 | } |
981 | 0 | mse[1][i][0] = (factor * mse[1][i][0]) >> 6; |
982 | 0 | } |
983 | 0 | } |
984 | 0 | } |
985 | | |
986 | | // Compute cost of (strength=0) to derive pcs->cdef_dist_dev |
987 | 0 | int64_t zero_dist = 0; |
988 | 0 | for (i = 0; i < sb_count; i++) { |
989 | 0 | zero_dist += mse[0][i][0] + mse[1][i][0]; |
990 | 0 | } |
991 | 0 | uint64_t zero_cost = RDCOST(lambda, av1_cost_literal(CDEF_STRENGTH_BITS * 2), zero_dist << 4); |
992 | | |
993 | | // Search for different numbers of signalling bits |
994 | 0 | for (i = 0; i <= 3; i++) { |
995 | 0 | int32_t best_lev0[CDEF_MAX_STRENGTHS] = {0}; |
996 | 0 | int32_t best_lev1[CDEF_MAX_STRENGTHS] = {0}; |
997 | 0 | nb_strengths = 1 << i; |
998 | 0 | uint64_t tot_mse = joint_strength_search_dual( |
999 | 0 | best_lev0, best_lev1, nb_strengths, mse, sb_count, start_gi, end_gi); |
1000 | |
|
1001 | 0 | const int total_bits = sb_count * i + nb_strengths * CDEF_STRENGTH_BITS * 2; |
1002 | 0 | const uint64_t cost = RDCOST(lambda, av1_cost_literal(total_bits), tot_mse * 16); |
1003 | 0 | if (cost < best_tot_mse) { |
1004 | 0 | best_tot_mse = cost; |
1005 | 0 | nb_strength_bits = i; |
1006 | 0 | for (int32_t j = 0; j < 1 << nb_strength_bits; j++) { |
1007 | 0 | frm_hdr->cdef_params.cdef_y_strength[j] = best_lev0[j]; |
1008 | 0 | frm_hdr->cdef_params.cdef_uv_strength[j] = cdef_search_ctrls->uv_from_y ? best_lev0[j] : best_lev1[j]; |
1009 | 0 | } |
1010 | 0 | } |
1011 | 0 | } |
1012 | |
|
1013 | 0 | pcs->cdef_dist_dev = zero_cost == 0 ? 0 : (int32_t)(1000 - ((1000 * best_tot_mse) / zero_cost)); |
1014 | 0 | nb_strengths = 1 << nb_strength_bits; |
1015 | |
|
1016 | 0 | frm_hdr->cdef_params.cdef_bits = nb_strength_bits; |
1017 | 0 | ppcs->nb_cdef_strengths = nb_strengths; |
1018 | | |
1019 | | // Assign each filter block its best strength index |
1020 | 0 | for (i = 0; i < sb_count; i++) { |
1021 | 0 | int32_t gi; |
1022 | 0 | int32_t best_gi = 0; |
1023 | 0 | uint64_t best_mse = (uint64_t)1 << 63; |
1024 | 0 | for (gi = 0; gi < ppcs->nb_cdef_strengths; gi++) { |
1025 | 0 | uint64_t curr = mse[0][i][frm_hdr->cdef_params.cdef_y_strength[gi]] + |
1026 | 0 | mse[1][i][frm_hdr->cdef_params.cdef_uv_strength[gi]]; |
1027 | 0 | if (curr < best_mse) { |
1028 | 0 | best_gi = gi; |
1029 | 0 | best_mse = curr; |
1030 | 0 | } |
1031 | 0 | } |
1032 | 0 | propagate_cdef_strength(pcs, sb_index[i], (int8_t)best_gi); |
1033 | 0 | } |
1034 | | |
1035 | | // Map search indices back to actual filter strengths |
1036 | 0 | int filter_map[TOTAL_STRENGTHS] = {0}; |
1037 | 0 | for (i = 0; i < first_pass_fs_num; i++) { |
1038 | 0 | filter_map[i] = cdef_search_ctrls->default_first_pass_fs[i]; |
1039 | 0 | } |
1040 | 0 | for (i = 0; i < default_second_pass_fs_num; i++) { |
1041 | 0 | filter_map[first_pass_fs_num + i] = cdef_search_ctrls->default_second_pass_fs[i]; |
1042 | 0 | } |
1043 | |
|
1044 | 0 | for (i = 0; i < ppcs->nb_cdef_strengths; i++) { |
1045 | 0 | frm_hdr->cdef_params.cdef_y_strength[i] = filter_map[frm_hdr->cdef_params.cdef_y_strength[i]]; |
1046 | 0 | frm_hdr->cdef_params.cdef_uv_strength[i] = filter_map[frm_hdr->cdef_params.cdef_uv_strength[i]]; |
1047 | 0 | } |
1048 | |
|
1049 | 0 | frm_hdr->cdef_params.cdef_damping = CDEF_DAMPING_FROM_QP(frm_hdr->quantization_params.base_q_idx); |
1050 | |
|
1051 | 0 | EB_FREE_ARRAY(mse[0]); |
1052 | 0 | EB_FREE_ARRAY(mse[1]); |
1053 | | EB_FREE_ARRAY(sb_index); |
1054 | 0 | } |
1055 | | #else |
1056 | | void finish_cdef_search(PictureControlSet* pcs) { |
1057 | | PictureParentControlSet* ppcs = pcs->ppcs; |
1058 | | FrameHeader* frm_hdr = &ppcs->frm_hdr; |
1059 | | Av1Common* cm = ppcs->av1_cm; |
1060 | | int32_t mi_rows = ppcs->av1_cm->mi_rows; |
1061 | | int32_t mi_cols = ppcs->av1_cm->mi_cols; |
1062 | | |
1063 | | int32_t fbr, fbc; |
1064 | | uint64_t best_tot_mse = (uint64_t)1 << 63; |
1065 | | int32_t sb_count; |
1066 | | int32_t nvfb = (mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; |
1067 | | int32_t nhfb = (mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; |
1068 | | //CDEF Settings |
1069 | | CdefSearchControls* cdef_search_ctrls = &pcs->ppcs->cdef_search_ctrls; |
1070 | | if (cdef_search_ctrls->use_qp_strength) { |
1071 | | int pred_y, pred_uv; |
1072 | | |
1073 | | // Predict Y/UV strengths from QP |
1074 | | svt_pick_cdef_from_qp(ppcs, 0, &pred_y, &pred_uv); |
1075 | | |
1076 | | // Frame-level parameters |
1077 | | frm_hdr->cdef_params.cdef_bits = 0; // only one strength index |
1078 | | ppcs->nb_cdef_strengths = 1; |
1079 | | frm_hdr->cdef_params.cdef_y_strength[0] = pred_y; |
1080 | | frm_hdr->cdef_params.cdef_uv_strength[0] = pred_uv; |
1081 | | frm_hdr->cdef_params.cdef_damping = 3 + (frm_hdr->quantization_params.base_q_idx >> 6); |
1082 | | |
1083 | | // Assign strength index 0 to all valid 64x64 blocks |
1084 | | for (fbr = 0; fbr < nvfb; ++fbr) { |
1085 | | for (fbc = 0; fbc < nhfb; ++fbc) { |
1086 | | MbModeInfo* mbmi = pcs->mi_grid_base[MI_SIZE_64X64 * fbr * pcs->mi_stride + MI_SIZE_64X64 * fbc]; |
1087 | | |
1088 | | // Skip duplicated 64x64 blocks inside larger 128x128/128x64/64x128 |
1089 | | if (((fbc & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) || |
1090 | | ((fbr & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128))) { |
1091 | | continue; |
1092 | | } |
1093 | | |
1094 | | // No filtering if the entire filter block is skipped |
1095 | | if (svt_sb_all_skip(pcs, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) { |
1096 | | continue; |
1097 | | } |
1098 | | |
1099 | | mbmi->cdef_strength = 0; |
1100 | | |
1101 | | // Duplicate for large blocks in SVT MI map |
1102 | | switch (mbmi->bsize) { |
1103 | | case BLOCK_128X128: |
1104 | | pcs->mi_grid_base[MI_SIZE_64X64 * fbr * pcs->mi_stride + MI_SIZE_64X64 * fbc + MI_SIZE_64X64] |
1105 | | ->cdef_strength = 0; |
1106 | | |
1107 | | pcs->mi_grid_base[(MI_SIZE_64X64 * fbr + MI_SIZE_64X64) * pcs->mi_stride + MI_SIZE_64X64 * fbc] |
1108 | | ->cdef_strength = 0; |
1109 | | |
1110 | | pcs->mi_grid_base[(MI_SIZE_64X64 * fbr + MI_SIZE_64X64) * pcs->mi_stride + MI_SIZE_64X64 * fbc + |
1111 | | MI_SIZE_64X64] |
1112 | | ->cdef_strength = 0; |
1113 | | break; |
1114 | | |
1115 | | case BLOCK_128X64: |
1116 | | pcs->mi_grid_base[MI_SIZE_64X64 * fbr * pcs->mi_stride + MI_SIZE_64X64 * fbc + MI_SIZE_64X64] |
1117 | | ->cdef_strength = 0; |
1118 | | break; |
1119 | | |
1120 | | case BLOCK_64X128: |
1121 | | pcs->mi_grid_base[(MI_SIZE_64X64 * fbr + MI_SIZE_64X64) * pcs->mi_stride + MI_SIZE_64X64 * fbc] |
1122 | | ->cdef_strength = 0; |
1123 | | break; |
1124 | | |
1125 | | default: |
1126 | | break; |
1127 | | } |
1128 | | } |
1129 | | } |
1130 | | return; |
1131 | | } |
1132 | | |
1133 | | CdefReconControls* cdef_recon_ctrls = &pcs->ppcs->cdef_recon_ctrls; |
1134 | | const int first_pass_fs_num = cdef_search_ctrls->first_pass_fs_num; |
1135 | | const int default_second_pass_fs_num = cdef_search_ctrls->default_second_pass_fs_num; |
1136 | | |
1137 | | if (cdef_search_ctrls->use_reference_cdef_fs) { |
1138 | | int32_t* sb_index; |
1139 | | EB_MALLOC_ARRAY_NO_CHECK(sb_index, nvfb * nhfb); |
1140 | | int32_t best_gi = 0; |
1141 | | sb_count = 0; |
1142 | | assert(sb_index != NULL); |
1143 | | for (fbr = 0; fbr < nvfb; ++fbr) { |
1144 | | for (fbc = 0; fbc < nhfb; ++fbc) { |
1145 | | const MbModeInfo* mbmi = pcs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc]; |
1146 | | if (((fbc & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) || |
1147 | | ((fbr & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128))) { |
1148 | | continue; |
1149 | | } |
1150 | | // No filtering if the entire filter block is skipped |
1151 | | if (svt_sb_all_skip(pcs, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) { |
1152 | | continue; |
1153 | | } |
1154 | | sb_index[sb_count] = MI_SIZE_64X64 * fbr * pcs->mi_stride + MI_SIZE_64X64 * fbc; |
1155 | | sb_count++; |
1156 | | } |
1157 | | } |
1158 | | for (int32_t i = 0; i < sb_count; i++) { |
1159 | | pcs->mi_grid_base[sb_index[i]]->cdef_strength = (int8_t)best_gi; |
1160 | | //in case the fb is within a block=128x128 or 128x64, or 64x128, then we generate param only for the first 64x64. |
1161 | | //since our mi map does not have the multi pointer single data assignment, we need to duplicate data. |
1162 | | BlockSize bsize = pcs->mi_grid_base[sb_index[i]]->bsize; |
1163 | | switch (bsize) { |
1164 | | case BLOCK_128X128: |
1165 | | pcs->mi_grid_base[sb_index[i] + MI_SIZE_64X64]->cdef_strength = (int8_t)best_gi; |
1166 | | pcs->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * pcs->mi_stride]->cdef_strength = (int8_t)best_gi; |
1167 | | pcs->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * pcs->mi_stride + MI_SIZE_64X64]->cdef_strength = |
1168 | | (int8_t)best_gi; |
1169 | | break; |
1170 | | case BLOCK_128X64: |
1171 | | pcs->mi_grid_base[sb_index[i] + MI_SIZE_64X64]->cdef_strength = (int8_t)best_gi; |
1172 | | break; |
1173 | | case BLOCK_64X128: |
1174 | | pcs->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * pcs->mi_stride]->cdef_strength = (int8_t)best_gi; |
1175 | | break; |
1176 | | default: |
1177 | | break; |
1178 | | } |
1179 | | } |
1180 | | frm_hdr->cdef_params.cdef_bits = 0; |
1181 | | ppcs->nb_cdef_strengths = 1; |
1182 | | //cdef_pri_damping & cdef_sec_damping consolidated to cdef_damping |
1183 | | int32_t pri_damping = 3 + (frm_hdr->quantization_params.base_q_idx >> 6); |
1184 | | frm_hdr->cdef_params.cdef_damping = pri_damping; |
1185 | | frm_hdr->cdef_params.cdef_y_strength[0] = cdef_search_ctrls->pred_y_f; |
1186 | | frm_hdr->cdef_params.cdef_uv_strength[0] = cdef_search_ctrls->pred_uv_f; |
1187 | | EB_FREE_ARRAY(sb_index); |
1188 | | return; |
1189 | | } |
1190 | | int32_t* sb_index; |
1191 | | // to keep track of the sb_address in units of SBs (not mi_size) |
1192 | | int32_t* sb_addr; |
1193 | | EB_MALLOC_ARRAY_NO_CHECK(sb_index, nvfb * nhfb); |
1194 | | EB_MALLOC_ARRAY_NO_CHECK(sb_addr, nvfb * nhfb); |
1195 | | assert(sb_index != NULL); |
1196 | | assert(sb_addr != NULL); |
1197 | | |
1198 | | uint64_t** mse[2]; |
1199 | | EB_MALLOC_ARRAY_NO_CHECK(mse[0], nvfb * nhfb); |
1200 | | EB_MALLOC_ARRAY_NO_CHECK(mse[1], nvfb * nhfb); |
1201 | | assert(mse[0] != NULL); |
1202 | | assert(mse[1] != NULL); |
1203 | | |
1204 | | int32_t start_gi = 0; |
1205 | | int32_t end_gi = first_pass_fs_num + default_second_pass_fs_num; |
1206 | | int32_t i; |
1207 | | int32_t nb_strengths; |
1208 | | int32_t nb_strength_bits; |
1209 | | uint64_t lambda; |
1210 | | uint32_t fast_lambda, full_lambda = 0; |
1211 | | |
1212 | | svt_aom_lambda_assign(pcs, |
1213 | | &fast_lambda, |
1214 | | &full_lambda, |
1215 | | pcs->ppcs->enhanced_pic->bit_depth, |
1216 | | pcs->ppcs->frm_hdr.quantization_params.base_q_idx, |
1217 | | false); |
1218 | | lambda = full_lambda; |
1219 | | sb_count = 0; |
1220 | | for (fbr = 0; fbr < nvfb; ++fbr) { |
1221 | | for (fbc = 0; fbc < nhfb; ++fbc) { |
1222 | | const MbModeInfo* mbmi = pcs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc]; |
1223 | | if (((fbc & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) || |
1224 | | ((fbr & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128))) { |
1225 | | continue; |
1226 | | } |
1227 | | |
1228 | | // No filtering if the entire filter block is skipped |
1229 | | if (pcs->skip_cdef_seg[fbr * nhfb + fbc]) { |
1230 | | continue; |
1231 | | } |
1232 | | // point to the MSE data |
1233 | | mse[0][sb_count] = pcs->mse_seg[0][fbr * nhfb + fbc]; |
1234 | | mse[1][sb_count] = pcs->mse_seg[1][fbr * nhfb + fbc]; |
1235 | | |
1236 | | sb_index[sb_count] = MI_SIZE_64X64 * fbr * pcs->mi_stride + MI_SIZE_64X64 * fbc; |
1237 | | sb_addr[sb_count] = fbr * nhfb + fbc; |
1238 | | sb_count++; |
1239 | | } |
1240 | | } |
1241 | | |
1242 | | nb_strength_bits = 0; |
1243 | | // Scale down the cost of the (0,0) filter strength to bias selection towards off. |
1244 | | // When off, can save the cost of the application. |
1245 | | if (cdef_recon_ctrls->zero_fs_cost_bias) { |
1246 | | const bool is_16bit = (pcs->scs->static_config.encoder_bit_depth > EB_EIGHT_BIT); |
1247 | | uint16_t factor; |
1248 | | for (i = 0; i < sb_count; i++) { |
1249 | | if (is_16bit) { |
1250 | | factor = cdef_recon_ctrls->zero_fs_cost_bias; |
1251 | | if (mse[0][i][0] < 5000) { |
1252 | | factor = MIN(factor - 10, 64); |
1253 | | } else if (mse[0][i][0] < 10000) { |
1254 | | factor = MIN(factor - 5, 64); |
1255 | | } else if (mse[0][i][0] > 25000) { |
1256 | | factor = MIN(factor + 1, 64); |
1257 | | } |
1258 | | mse[0][i][0] = (factor * mse[0][i][0]) >> 6; |
1259 | | |
1260 | | factor = cdef_recon_ctrls->zero_fs_cost_bias; |
1261 | | if (mse[1][i][0] < 5000) { |
1262 | | factor = MIN(factor - 10, 64); |
1263 | | } else if (mse[1][i][0] < 10000) { |
1264 | | factor = MIN(factor - 5, 64); |
1265 | | } else if (mse[1][i][0] > 25000) { |
1266 | | factor = MIN(factor + 1, 64); |
1267 | | } |
1268 | | mse[1][i][0] = (factor * mse[1][i][0]) >> 6; |
1269 | | } else { |
1270 | | factor = cdef_recon_ctrls->zero_fs_cost_bias; |
1271 | | if (mse[0][i][0] > 25000) { |
1272 | | factor = MIN(factor + 2, 64); |
1273 | | } else if (mse[0][i][0] > 10000) { |
1274 | | factor = MIN(factor + 1, 64); |
1275 | | } |
1276 | | mse[0][i][0] = (factor * mse[0][i][0]) >> 6; |
1277 | | |
1278 | | factor = cdef_recon_ctrls->zero_fs_cost_bias; |
1279 | | if (mse[1][i][0] > 25000) { |
1280 | | factor = MIN(factor + 2, 64); |
1281 | | } else if (mse[1][i][0] > 10000) { |
1282 | | factor = MIN(factor + 1, 64); |
1283 | | } |
1284 | | |
1285 | | mse[1][i][0] = (factor * mse[1][i][0]) >> 6; |
1286 | | } |
1287 | | } |
1288 | | } |
1289 | | // Compute cost of off to use in deriving pcs->cdef_dist_dev |
1290 | | int64_t zero_dist = 0; |
1291 | | for (i = 0; i < sb_count; i++) { |
1292 | | zero_dist += mse[0][i][0] + mse[1][i][0]; |
1293 | | } |
1294 | | uint64_t zero_cost = RDCOST(lambda, av1_cost_literal(CDEF_STRENGTH_BITS * 2), zero_dist << 4); |
1295 | | /* Search for different number of signalling bits. */ |
1296 | | for (i = 0; i <= 3; i++) { |
1297 | | int32_t best_lev0[CDEF_MAX_STRENGTHS] = {0}; |
1298 | | int32_t best_lev1[CDEF_MAX_STRENGTHS] = {0}; |
1299 | | nb_strengths = 1 << i; |
1300 | | uint64_t tot_mse = joint_strength_search_dual( |
1301 | | best_lev0, best_lev1, nb_strengths, mse, sb_count, start_gi, end_gi); |
1302 | | /* Count superblock signalling cost. */ |
1303 | | const int total_bits = sb_count * i + nb_strengths * CDEF_STRENGTH_BITS * 2; |
1304 | | const int rate_cost = av1_cost_literal(total_bits); |
1305 | | const uint64_t dist = tot_mse * 16; |
1306 | | tot_mse = RDCOST(lambda, rate_cost, dist); |
1307 | | if (tot_mse < best_tot_mse) { |
1308 | | best_tot_mse = tot_mse; |
1309 | | nb_strength_bits = i; |
1310 | | for (int32_t j = 0; j < 1 << nb_strength_bits; j++) { |
1311 | | frm_hdr->cdef_params.cdef_y_strength[j] = best_lev0[j]; |
1312 | | frm_hdr->cdef_params.cdef_uv_strength[j] = cdef_search_ctrls->uv_from_y ? best_lev0[j] : best_lev1[j]; |
1313 | | } |
1314 | | } |
1315 | | } |
1316 | | pcs->cdef_dist_dev = zero_cost == 0 ? 0 : (int32_t)(1000 - ((1000 * best_tot_mse) / zero_cost)); |
1317 | | nb_strengths = 1 << nb_strength_bits; |
1318 | | |
1319 | | frm_hdr->cdef_params.cdef_bits = nb_strength_bits; |
1320 | | ppcs->nb_cdef_strengths = nb_strengths; |
1321 | | for (i = 0; i < sb_count; i++) { |
1322 | | int32_t gi; |
1323 | | int32_t best_gi; |
1324 | | uint64_t best_mse = (uint64_t)1 << 63; |
1325 | | best_gi = 0; |
1326 | | // skip this loop for SBs that are skipped in the search |
1327 | | for (gi = 0; gi < ppcs->nb_cdef_strengths; gi++) { |
1328 | | uint64_t curr = mse[0][i][frm_hdr->cdef_params.cdef_y_strength[gi]]; |
1329 | | curr += mse[1][i][frm_hdr->cdef_params.cdef_uv_strength[gi]]; |
1330 | | if (curr < best_mse) { |
1331 | | best_gi = gi; |
1332 | | best_mse = curr; |
1333 | | } |
1334 | | } |
1335 | | |
1336 | | pcs->mi_grid_base[sb_index[i]]->cdef_strength = (int8_t)best_gi; |
1337 | | //in case the fb is within a block=128x128 or 128x64, or 64x128, then we generate param only for the first 64x64. |
1338 | | //since our mi map does not have the multi pointer single data assignment, we need to duplicate data. |
1339 | | BlockSize bsize = pcs->mi_grid_base[sb_index[i]]->bsize; |
1340 | | |
1341 | | switch (bsize) { |
1342 | | case BLOCK_128X128: |
1343 | | pcs->mi_grid_base[sb_index[i] + MI_SIZE_64X64]->cdef_strength = (int8_t)best_gi; |
1344 | | pcs->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * pcs->mi_stride]->cdef_strength = (int8_t)best_gi; |
1345 | | pcs->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * pcs->mi_stride + MI_SIZE_64X64]->cdef_strength = (int8_t) |
1346 | | best_gi; |
1347 | | break; |
1348 | | case BLOCK_128X64: |
1349 | | pcs->mi_grid_base[sb_index[i] + MI_SIZE_64X64]->cdef_strength = (int8_t)best_gi; |
1350 | | break; |
1351 | | case BLOCK_64X128: |
1352 | | pcs->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * pcs->mi_stride]->cdef_strength = (int8_t)best_gi; |
1353 | | break; |
1354 | | default: |
1355 | | break; |
1356 | | } |
1357 | | } |
1358 | | int filter_map[TOTAL_STRENGTHS] = {0}; |
1359 | | for (i = 0; i < first_pass_fs_num; i++) { |
1360 | | filter_map[i] = cdef_search_ctrls->default_first_pass_fs[i]; |
1361 | | } |
1362 | | for (i = first_pass_fs_num; i < (first_pass_fs_num + default_second_pass_fs_num); i++) { |
1363 | | filter_map[i] = cdef_search_ctrls->default_second_pass_fs[i - first_pass_fs_num]; |
1364 | | } |
1365 | | |
1366 | | for (i = 0; i < ppcs->nb_cdef_strengths; i++) { |
1367 | | frm_hdr->cdef_params.cdef_y_strength[i] = filter_map[frm_hdr->cdef_params.cdef_y_strength[i]]; |
1368 | | frm_hdr->cdef_params.cdef_uv_strength[i] = filter_map[frm_hdr->cdef_params.cdef_uv_strength[i]]; |
1369 | | } |
1370 | | //cdef_pri_damping & cdef_sec_damping consolidated to cdef_damping |
1371 | | frm_hdr->cdef_params.cdef_damping = 3 + (frm_hdr->quantization_params.base_q_idx >> 6); |
1372 | | EB_FREE_ARRAY(mse[0]); |
1373 | | EB_FREE_ARRAY(mse[1]); |
1374 | | EB_FREE_ARRAY(sb_index); |
1375 | | EB_FREE_ARRAY(sb_addr); |
1376 | | } |
1377 | | #endif |