/src/libhevc/encoder/hme_search_algo.c
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2018 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | /** |
21 | | ****************************************************************************** |
22 | | * @file hme_search_algo.c |
23 | | * |
24 | | * @brief |
25 | | * Contains various search algorithms to be used by coarse/refinement layers |
26 | | * |
27 | | * @author |
28 | | * Ittiam |
29 | | * |
30 | | * |
31 | | * List of Functions |
32 | | * hme_compute_grid_results_step_gt_1() |
33 | | * hme_compute_grid_results_step_1() |
34 | | * hme_pred_search_square_stepn() |
35 | | * |
36 | | ****************************************************************************** |
37 | | */ |
38 | | |
39 | | /*****************************************************************************/ |
40 | | /* File Includes */ |
41 | | /*****************************************************************************/ |
42 | | /* System include files */ |
43 | | #include <stdio.h> |
44 | | #include <string.h> |
45 | | #include <stdlib.h> |
46 | | #include <assert.h> |
47 | | #include <stdarg.h> |
48 | | #include <math.h> |
49 | | #include <limits.h> |
50 | | |
51 | | /* User include files */ |
52 | | #include "ihevc_typedefs.h" |
53 | | #include "itt_video_api.h" |
54 | | #include "ihevce_api.h" |
55 | | |
56 | | #include "rc_cntrl_param.h" |
57 | | #include "rc_frame_info_collector.h" |
58 | | #include "rc_look_ahead_params.h" |
59 | | |
60 | | #include "ihevc_defs.h" |
61 | | #include "ihevc_structs.h" |
62 | | #include "ihevc_platform_macros.h" |
63 | | #include "ihevc_deblk.h" |
64 | | #include "ihevc_itrans_recon.h" |
65 | | #include "ihevc_chroma_itrans_recon.h" |
66 | | #include "ihevc_chroma_intra_pred.h" |
67 | | #include "ihevc_intra_pred.h" |
68 | | #include "ihevc_inter_pred.h" |
69 | | #include "ihevc_mem_fns.h" |
70 | | #include "ihevc_padding.h" |
71 | | #include "ihevc_weighted_pred.h" |
72 | | #include "ihevc_sao.h" |
73 | | #include "ihevc_resi_trans.h" |
74 | | #include "ihevc_quant_iquant_ssd.h" |
75 | | #include "ihevc_cabac_tables.h" |
76 | | |
77 | | #include "ihevce_defs.h" |
78 | | #include "ihevce_lap_enc_structs.h" |
79 | | #include "ihevce_multi_thrd_structs.h" |
80 | | #include "ihevce_multi_thrd_funcs.h" |
81 | | #include "ihevce_me_common_defs.h" |
82 | | #include "ihevce_had_satd.h" |
83 | | #include "ihevce_error_codes.h" |
84 | | #include "ihevce_bitstream.h" |
85 | | #include "ihevce_cabac.h" |
86 | | #include "ihevce_rdoq_macros.h" |
87 | | #include "ihevce_function_selector.h" |
88 | | #include "ihevce_enc_structs.h" |
89 | | #include "ihevce_entropy_structs.h" |
90 | | #include "ihevce_cmn_utils_instr_set_router.h" |
91 | | #include "ihevce_enc_loop_structs.h" |
92 | | #include "ihevce_bs_compute_ctb.h" |
93 | | #include "ihevce_global_tables.h" |
94 | | #include "ihevce_dep_mngr_interface.h" |
95 | | #include "hme_datatype.h" |
96 | | #include "hme_interface.h" |
97 | | #include "hme_common_defs.h" |
98 | | #include "hme_defs.h" |
99 | | #include "ihevce_me_instr_set_router.h" |
100 | | #include "hme_globals.h" |
101 | | #include "hme_utils.h" |
102 | | #include "hme_coarse.h" |
103 | | #include "hme_fullpel.h" |
104 | | #include "hme_subpel.h" |
105 | | #include "hme_refine.h" |
106 | | #include "hme_err_compute.h" |
107 | | #include "hme_common_utils.h" |
108 | | #include "hme_search_algo.h" |
109 | | #include "ihevce_stasino_helpers.h" |
110 | | #include "ihevce_common_utils.h" |
111 | | |
112 | | /*****************************************************************************/ |
113 | | /* Function Definitions */ |
114 | | /*****************************************************************************/ |
115 | | |
116 | | /** |
117 | | ******************************************************************************** |
118 | | * @fn void hme_compute_grid_results_step_1(err_prms_t *ps_err_prms, |
119 | | result_upd_prms_t *ps_result_prms, |
120 | | BLK_SIZE_T e_blk_size) |
121 | | * |
122 | | * @brief Updates results for a grid of step = 1 |
123 | | * |
124 | | * @param[in] ps_err_prms: Various parameters to this function |
125 | | * |
126 | | * @param[in] ps_result_prms : Parameters pertaining to result updation |
127 | | * |
128 | | * @param[out] e_blk_size: Block size of the blk being searched for |
129 | | * |
130 | | * @return none |
131 | | ******************************************************************************** |
132 | | */ |
133 | | void hme_compute_grid_results( |
134 | | err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms, BLK_SIZE_T e_blk_size) |
135 | 14.7M | { |
136 | 14.7M | PF_RESULT_FXN_T pf_hme_result_fxn; |
137 | 14.7M | PF_SAD_FXN_T pf_sad_fxn; |
138 | 14.7M | S32 i4_num_results; |
139 | 14.7M | S32 part_id; |
140 | | |
141 | 14.7M | part_id = ps_result_prms->pi4_valid_part_ids[0]; |
142 | | |
143 | 14.7M | i4_num_results = (S32)ps_result_prms->ps_search_results->u1_num_results_per_part; |
144 | | |
145 | 14.7M | pf_sad_fxn = hme_get_sad_fxn(e_blk_size, ps_err_prms->i4_grid_mask, ps_err_prms->i4_part_mask); |
146 | | |
147 | 14.7M | pf_hme_result_fxn = |
148 | 14.7M | hme_get_result_fxn(ps_err_prms->i4_grid_mask, ps_err_prms->i4_part_mask, i4_num_results); |
149 | | |
150 | 14.7M | pf_sad_fxn(ps_err_prms); |
151 | 14.7M | pf_hme_result_fxn(ps_result_prms); |
152 | 14.7M | } |
153 | | |
154 | | /** |
155 | | ******************************************************************************** |
156 | | * @fn void hme_pred_search_square_stepn(hme_search_prms_t *ps_search_prms, |
157 | | * layer_ctxt_t *ps_layer_ctxt) |
158 | | * |
159 | | * @brief Implements predictive search, with square grid refinement. In this |
160 | | * case, we start with a bigger step size, like 4, refining upto a |
161 | | * variable number of pts, till we hit end of search range or hit a |
162 | | * minima. Then we refine using smaller steps. The bigger step size |
163 | | * like 4 or 2, do not use optimized SAD functions, they evaluate |
164 | | * SAD for each individual pt. |
165 | | * |
166 | | * @param[in,out] ps_search_prms: All the params to this function |
167 | | * |
168 | | * @param[in] ps_layer_ctxt: Context for the layer |
169 | | * |
170 | | * @return None |
171 | | ******************************************************************************** |
172 | | */ |
173 | | void hme_pred_search_square_stepn( |
174 | | hme_search_prms_t *ps_search_prms, |
175 | | layer_ctxt_t *ps_layer_ctxt, |
176 | | wgt_pred_ctxt_t *ps_wt_inp_prms, |
177 | | ME_QUALITY_PRESETS_T e_me_quality_preset, |
178 | | ihevce_me_optimised_function_list_t *ps_me_optimised_function_list |
179 | | |
180 | | ) |
181 | 6.64M | { |
182 | | /* Stores the SAD for all parts at each pt in the grid */ |
183 | 6.64M | S32 ai4_sad_grid[9][TOT_NUM_PARTS]; |
184 | | |
185 | 6.64M | S32 ai4_valid_part_ids[TOT_NUM_PARTS + 1]; |
186 | | |
187 | | /* Atributes of input candidates */ |
188 | 6.64M | search_candt_t *ps_search_candts; |
189 | 6.64M | search_node_t s_search_node; |
190 | | |
191 | | /* Number of candidates to search */ |
192 | 6.64M | S32 i4_num_candts, max_num_iters, i4_num_results; |
193 | | |
194 | | /* Input and reference attributes */ |
195 | 6.64M | S32 i4_inp_stride, i4_ref_stride, i4_ref_offset; |
196 | | |
197 | | /* The reference is actually an array of ptrs since there are several */ |
198 | | /* reference id. So an array gets passed form calling function */ |
199 | 6.64M | U08 **ppu1_ref; |
200 | | |
201 | | /* Holds the search results at the end of this fxn */ |
202 | 6.64M | search_results_t *ps_search_results; |
203 | | |
204 | | /* These control number of parts and number of pts in grid to search */ |
205 | 6.64M | S32 i4_part_mask, i4_grid_mask; |
206 | | |
207 | | /* Blk width, blk height and blk size are derived from input params */ |
208 | 6.64M | BLK_SIZE_T e_blk_size; |
209 | 6.64M | CU_SIZE_T e_cu_size; |
210 | 6.64M | S32 i4_blk_wd, i4_blk_ht, i4_step, i4_candt, i4_iter; |
211 | 6.64M | S32 i4_inp_off; |
212 | 6.64M | S32 i4_min_id; |
213 | | /* Points to the range limits for mv */ |
214 | 6.64M | range_prms_t *ps_range_prms; |
215 | | |
216 | | /*************************************************************************/ |
217 | | /* These functions pointers for calculating Err and the result update */ |
218 | | /* Each carries its own parameters structure, which is generated on the */ |
219 | | /* fly in this function */ |
220 | | /*************************************************************************/ |
221 | 6.64M | err_prms_t s_err_prms; |
222 | 6.64M | result_upd_prms_t s_result_prms; |
223 | | |
224 | 6.64M | max_num_iters = ps_search_prms->i4_max_iters; |
225 | | /* Using the member 0 to store for all ref. idx., see in coarsest */ |
226 | 6.64M | ps_range_prms = ps_search_prms->aps_mv_range[0]; |
227 | 6.64M | i4_inp_stride = ps_search_prms->i4_inp_stride; |
228 | | /* Move to the location of the search blk in inp buffer */ |
229 | 6.64M | i4_inp_off = ps_search_prms->i4_cu_x_off; |
230 | 6.64M | i4_inp_off += (ps_search_prms->i4_cu_y_off * i4_inp_stride); |
231 | | |
232 | 6.64M | ps_search_results = ps_search_prms->ps_search_results; |
233 | | |
234 | | /*************************************************************************/ |
235 | | /* Depending on flag i4_use_rec, we use either input of previously */ |
236 | | /* encoded pictures or we use recon of previously encoded pictures. */ |
237 | | /*************************************************************************/ |
238 | 6.64M | if(ps_search_prms->i4_use_rec == 1) |
239 | 0 | { |
240 | 0 | i4_ref_stride = ps_layer_ctxt->i4_rec_stride; |
241 | 0 | ppu1_ref = ps_layer_ctxt->ppu1_list_rec_fxfy; |
242 | 0 | } |
243 | 6.64M | else |
244 | 6.64M | { |
245 | 6.64M | i4_ref_stride = ps_layer_ctxt->i4_inp_stride; |
246 | 6.64M | ppu1_ref = ps_layer_ctxt->ppu1_list_inp; |
247 | 6.64M | } |
248 | 6.64M | i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off; |
249 | | |
250 | | /*************************************************************************/ |
251 | | /* Obtain the blk size of the search blk. Assumed here that the search */ |
252 | | /* is done on a CU size, rather than any arbitrary blk size. */ |
253 | | /*************************************************************************/ |
254 | 6.64M | ps_search_results = ps_search_prms->ps_search_results; |
255 | 6.64M | e_blk_size = ps_search_prms->e_blk_size; |
256 | 6.64M | i4_blk_wd = (S32)gau1_blk_size_to_wd[e_blk_size]; |
257 | 6.64M | i4_blk_ht = (S32)gau1_blk_size_to_ht[e_blk_size]; |
258 | 6.64M | e_cu_size = ps_search_results->e_cu_size; |
259 | 6.64M | i4_num_results = (S32)ps_search_results->u1_num_results_per_part; |
260 | | |
261 | 6.64M | ps_search_candts = ps_search_prms->ps_search_candts; |
262 | 6.64M | i4_num_candts = ps_search_prms->i4_num_init_candts; |
263 | 6.64M | i4_part_mask = ps_search_prms->i4_part_mask; |
264 | | |
265 | | /*************************************************************************/ |
266 | | /* This array stores the ids of the partitions whose */ |
267 | | /* SADs are updated. Since the partitions whose SADs are updated may not */ |
268 | | /* be in contiguous order, we supply another level of indirection. */ |
269 | | /*************************************************************************/ |
270 | 6.64M | hme_create_valid_part_ids(i4_part_mask, ai4_valid_part_ids); |
271 | | |
272 | | /* Update the parameters used to pass to SAD */ |
273 | | /* input ptr, strides, SAD Grid, part mask, blk width and ht */ |
274 | | /* The above are fixed ptrs, only pu1_ref and grid mask are */ |
275 | | /* varying params which are updated just before calling fxn */ |
276 | 6.64M | s_err_prms.i4_inp_stride = i4_inp_stride; |
277 | 6.64M | s_err_prms.i4_ref_stride = i4_ref_stride; |
278 | 6.64M | s_err_prms.i4_part_mask = i4_part_mask; |
279 | 6.64M | s_err_prms.pi4_sad_grid = &ai4_sad_grid[0][0]; |
280 | 6.64M | s_err_prms.i4_blk_wd = i4_blk_wd; |
281 | 6.64M | s_err_prms.i4_blk_ht = i4_blk_ht; |
282 | 6.64M | s_err_prms.pi4_valid_part_ids = ai4_valid_part_ids; |
283 | | |
284 | 6.64M | s_result_prms.pf_mv_cost_compute = ps_search_prms->pf_mv_cost_compute; |
285 | 6.64M | s_result_prms.ps_search_results = ps_search_results; |
286 | 6.64M | s_result_prms.pi4_valid_part_ids = ai4_valid_part_ids; |
287 | 6.64M | s_result_prms.i1_ref_idx = ps_search_prms->i1_ref_idx; |
288 | 6.64M | s_result_prms.i4_part_mask = ps_search_prms->i4_part_mask; |
289 | 6.64M | s_result_prms.ps_search_node_base = &s_search_node; |
290 | 6.64M | s_result_prms.pi4_sad_grid = &ai4_sad_grid[0][0]; |
291 | | |
292 | | /* Run through each of the candts in a loop */ |
293 | 34.6M | for(i4_candt = 0; i4_candt < i4_num_candts; i4_candt++) |
294 | 27.9M | { |
295 | 27.9M | S32 i4_num_refine; |
296 | | |
297 | 27.9M | i4_step = ps_search_prms->i4_start_step; |
298 | | |
299 | 27.9M | s_search_node = *(ps_search_candts->ps_search_node); |
300 | | |
301 | | /* initialize minimum cost for this candidate. As we search around */ |
302 | | /* this candidate, this is used to check early exit, when in any */ |
303 | | /* given iteration, the center pt of the grid is lowest value */ |
304 | 27.9M | s_result_prms.i4_min_cost = MAX_32BIT_VAL; |
305 | | |
306 | | /* If we need to do refinements, then we need to evaluate */ |
307 | | /* neighbouring pts. Before doing so, we have to do */ |
308 | | /* basic range checks against max allowed mvs */ |
309 | 27.9M | i4_num_refine = ps_search_candts->u1_num_steps_refine; |
310 | | |
311 | 27.9M | CLIP_MV_WITHIN_RANGE( |
312 | 27.9M | s_search_node.s_mv.i2_mvx, s_search_node.s_mv.i2_mvy, ps_range_prms, 0, 0, 0); |
313 | | |
314 | | /* The first time, we search all 8 pts around init candt plus the init candt */ |
315 | 27.9M | i4_grid_mask = 0x1ff; |
316 | 27.9M | s_err_prms.pu1_inp = ps_wt_inp_prms->apu1_wt_inp[s_search_node.i1_ref_idx] + i4_inp_off; |
317 | | |
318 | 31.9M | for(i4_iter = 0; i4_iter < max_num_iters; i4_iter++) |
319 | 27.9M | { |
320 | 27.9M | i4_grid_mask &= hme_clamp_grid_by_mvrange(&s_search_node, i4_step, ps_range_prms); |
321 | | |
322 | 27.9M | s_err_prms.i4_grid_mask = i4_grid_mask; |
323 | 27.9M | s_err_prms.pu1_ref = ppu1_ref[s_search_node.i1_ref_idx] + i4_ref_offset; |
324 | 27.9M | s_err_prms.pu1_ref += |
325 | 27.9M | (s_search_node.s_mv.i2_mvx + |
326 | 27.9M | (s_search_node.s_mv.i2_mvy * s_err_prms.i4_ref_stride)); |
327 | | |
328 | 27.9M | s_result_prms.i4_step = i4_step; |
329 | 27.9M | s_err_prms.i4_step = i4_step; |
330 | 27.9M | s_result_prms.i4_grid_mask = i4_grid_mask; |
331 | | |
332 | | /* For Top,TopLeft and Left cand., get only center point SAD */ |
333 | | /* and do early exit */ |
334 | 27.9M | if(0 == i4_num_refine) |
335 | 16.0M | { |
336 | 16.0M | s_err_prms.i4_grid_mask = 0x1; |
337 | 16.0M | s_result_prms.i4_grid_mask = 0x1; |
338 | | |
339 | | /* sad pt fun. populates sad to 0th location, whereas update */ |
340 | | /* fun. takes it based on part. id */ |
341 | 16.0M | s_err_prms.pi4_sad_grid = |
342 | 16.0M | s_result_prms.pi4_sad_grid + (1 * s_result_prms.pi4_valid_part_ids[0]); |
343 | | |
344 | 16.0M | ps_me_optimised_function_list->pf_evalsad_pt_npu_mxn_8bit(&s_err_prms); |
345 | | |
346 | 16.0M | s_err_prms.pi4_sad_grid = s_result_prms.pi4_sad_grid; |
347 | | |
348 | 16.0M | if(ME_XTREME_SPEED_25 == e_me_quality_preset) |
349 | 0 | hme_update_results_grid_pu_bestn_xtreme_speed(&s_result_prms); |
350 | 16.0M | else |
351 | 16.0M | hme_update_results_grid_pu_bestn(&s_result_prms); |
352 | | |
353 | 16.0M | i4_min_id = (S32)PT_C; /* Center Point */ |
354 | 16.0M | i4_step = 0; /* No further refinment */ |
355 | 16.0M | s_result_prms.i4_step = i4_step; |
356 | 16.0M | s_err_prms.i4_step = i4_step; |
357 | 16.0M | } |
358 | 11.9M | else |
359 | 11.9M | { |
360 | 11.9M | if(ME_XTREME_SPEED_25 == e_me_quality_preset) |
361 | 1.30M | { |
362 | 1.30M | err_prms_t *ps_err_prms = &s_err_prms; |
363 | 1.30M | ASSERT(ps_err_prms->i4_grid_mask != 1); |
364 | 1.30M | ASSERT((ps_err_prms->i4_part_mask == 4) || (ps_err_prms->i4_part_mask == 16)); |
365 | | |
366 | | /*****************************************************************/ |
367 | | /* In this case, there are no partial updates. The blk can be */ |
368 | | /* of any type and need not be a CU. The only thing that matters */ |
369 | | /* here is the width of the blk, 4/8/(>=16) */ |
370 | | /*****************************************************************/ |
371 | 1.30M | ps_me_optimised_function_list->pf_evalsad_grid_npu_MxN(&s_err_prms); |
372 | | |
373 | 1.30M | hme_update_results_grid_pu_bestn_xtreme_speed(&s_result_prms); |
374 | 1.30M | } |
375 | 10.6M | else |
376 | 10.6M | { |
377 | | /* Obtain SAD for all 9 pts in grid*/ |
378 | 10.6M | hme_compute_grid_results(&s_err_prms, &s_result_prms, e_blk_size); |
379 | 10.6M | } |
380 | | |
381 | | /* Early exit in case of centre being local minima */ |
382 | 11.9M | i4_min_id = s_result_prms.i4_min_id; |
383 | 11.9M | } |
384 | | |
385 | 27.9M | i4_grid_mask = gai4_opt_grid_mask[i4_min_id]; |
386 | | |
387 | 27.9M | s_search_node.s_mv.i2_mvx += (i4_step * gai1_grid_id_to_x[i4_min_id]); |
388 | 27.9M | s_search_node.s_mv.i2_mvy += (i4_step * gai1_grid_id_to_y[i4_min_id]); |
389 | 27.9M | if(i4_min_id == (S32)PT_C) |
390 | 24.0M | break; |
391 | 27.9M | } |
392 | | |
393 | | /* Next keep reducing stepsize by factor of 2 */ |
394 | 27.9M | i4_step >>= 1; |
395 | 33.4M | while(i4_step) |
396 | 5.43M | { |
397 | 5.43M | i4_grid_mask = 0x1fe & |
398 | 5.43M | hme_clamp_grid_by_mvrange(&s_search_node, i4_step, ps_range_prms); |
399 | | //i4_grid_mask &= 0x1fe; |
400 | | |
401 | 5.43M | s_err_prms.i4_grid_mask = i4_grid_mask; |
402 | 5.43M | s_result_prms.i4_grid_mask = i4_grid_mask; |
403 | 5.43M | s_err_prms.i4_step = i4_step; |
404 | 5.43M | s_result_prms.i4_step = i4_step; |
405 | 5.43M | s_err_prms.pu1_ref = ppu1_ref[s_search_node.i1_ref_idx] + i4_ref_offset; |
406 | 5.43M | s_err_prms.pu1_ref += |
407 | 5.43M | (s_search_node.s_mv.i2_mvx + |
408 | 5.43M | (s_search_node.s_mv.i2_mvy * s_err_prms.i4_ref_stride)); |
409 | 5.43M | if(ME_XTREME_SPEED_25 == e_me_quality_preset) |
410 | 1.30M | { |
411 | 1.30M | err_prms_t *ps_err_prms = &s_err_prms; |
412 | 1.30M | ASSERT(ps_err_prms->i4_grid_mask != 1); |
413 | 1.30M | ASSERT((ps_err_prms->i4_part_mask == 4) || (ps_err_prms->i4_part_mask == 16)); |
414 | | |
415 | | /*****************************************************************/ |
416 | | /* In this case, there are no partial updates. The blk can be */ |
417 | | /* of any type and need not be a CU. The only thing that matters */ |
418 | | /* here is the width of the blk, 4/8/(>=16) */ |
419 | | /*****************************************************************/ |
420 | 1.30M | ps_me_optimised_function_list->pf_evalsad_grid_npu_MxN(&s_err_prms); |
421 | | |
422 | 1.30M | hme_update_results_grid_pu_bestn_xtreme_speed(&s_result_prms); |
423 | 1.30M | } |
424 | 4.12M | else |
425 | 4.12M | { |
426 | 4.12M | hme_compute_grid_results(&s_err_prms, &s_result_prms, e_blk_size); |
427 | 4.12M | } |
428 | | |
429 | 5.43M | i4_min_id = s_result_prms.i4_min_id; |
430 | | |
431 | 5.43M | s_search_node.s_mv.i2_mvx += (i4_step * gai1_grid_id_to_x[i4_min_id]); |
432 | 5.43M | s_search_node.s_mv.i2_mvy += (i4_step * gai1_grid_id_to_y[i4_min_id]); |
433 | | |
434 | 5.43M | i4_step >>= 1; |
435 | 5.43M | } |
436 | | |
437 | 27.9M | ps_search_candts++; |
438 | 27.9M | } |
439 | 6.64M | } |
440 | | |
441 | | /** |
442 | | ******************************************************************************** |
443 | | * @fn hme_pred_search_square_step1(hme_search_prms_t *ps_search_prms, |
444 | | * layer_ctxt_t *ps_layer_ctxt) |
445 | | * |
446 | | * @brief Implements predictive search with square grid refinement. In this |
447 | | * case, the square grid is of step 1 always. since this is considered |
448 | | * to be more of a refinement search |
449 | | * |
450 | | * @param[in,out] ps_search_prms: All the params to this function |
451 | | * |
452 | | * @param[in] ps_layer_ctxt: All info about this layer |
453 | | * |
454 | | * @return None |
455 | | ******************************************************************************** |
456 | | */ |
457 | | /** |
458 | | ******************************************************************************** |
459 | | * @fn hme_pred_search(hme_search_prms_t *ps_search_prms, |
460 | | * layer_ctxt_t *ps_layer_ctxt) |
461 | | * |
462 | | * @brief Implements predictive search after removing duplicate candidates |
463 | | * from initial list. Each square grid (of step 1) is expanded |
464 | | * to nine search pts before the dedeuplication process. one point |
465 | | * cost is then evaluated for each unique node after the deduplication |
466 | | * process |
467 | | * |
468 | | * @param[in,out] ps_search_prms: All the params to this function |
469 | | * |
470 | | * @param[in] ps_layer_ctxt: All info about this layer |
471 | | * |
472 | | * @return None |
473 | | ******************************************************************************** |
474 | | */ |
475 | | void hme_pred_search( |
476 | | hme_search_prms_t *ps_search_prms, |
477 | | layer_ctxt_t *ps_layer_ctxt, |
478 | | wgt_pred_ctxt_t *ps_wt_inp_prms, |
479 | | S08 i1_grid_flag, |
480 | | ihevce_me_optimised_function_list_t *ps_me_optimised_function_list |
481 | | |
482 | | ) |
483 | 3.88M | { |
484 | | /* Stores the SAD for all parts at each pt in the grid */ |
485 | 3.88M | S32 ai4_sad_grid[9 * TOT_NUM_PARTS]; |
486 | | |
487 | | /* Atributes of input candidates */ |
488 | 3.88M | search_node_t *ps_search_node; |
489 | | |
490 | 3.88M | search_results_t *ps_search_results; |
491 | 3.88M | S32 i4_num_nodes, i4_candt; |
492 | | |
493 | | /* Input and reference attributes */ |
494 | 3.88M | S32 i4_inp_stride, i4_ref_stride, i4_ref_offset; |
495 | | |
496 | | /* The reference is actually an array of ptrs since there are several */ |
497 | | /* reference id. So an array gets passed form calling function */ |
498 | 3.88M | U08 **ppu1_ref; |
499 | | |
500 | | /* These control number of parts and number of pts in grid to search */ |
501 | 3.88M | S32 i4_part_mask, i4_grid_mask; |
502 | | |
503 | 3.88M | S32 shift_for_cu_size; |
504 | | |
505 | | /* Blk width, blk height and blk size are derived from input params */ |
506 | 3.88M | BLK_SIZE_T e_blk_size; |
507 | 3.88M | CU_SIZE_T e_cu_size; |
508 | 3.88M | S32 i4_blk_wd, i4_blk_ht; |
509 | | |
510 | | /*************************************************************************/ |
511 | | /* These functions pointers for calculating Err and the result update */ |
512 | | /* Each carries its own parameters structure, which is generated on the */ |
513 | | /* fly in this function */ |
514 | | /*************************************************************************/ |
515 | 3.88M | PF_RESULT_FXN_T pf_hme_result_fxn; |
516 | 3.88M | PF_SAD_FXN_T pf_sad_fxn; |
517 | 3.88M | PF_CALC_SAD_AND_RESULT pf_calc_sad_and_result; |
518 | 3.88M | err_prms_t s_err_prms; |
519 | 3.88M | result_upd_prms_t s_result_prms; |
520 | 3.88M | S32 i4_num_results; |
521 | 3.88M | S32 i4_inp_off; |
522 | 3.88M | fullpel_refine_ctxt_t *ps_fullpel_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt; |
523 | | |
524 | 3.88M | i4_inp_stride = ps_search_prms->i4_inp_stride; |
525 | | |
526 | | /* Move to the location of the search blk in inp buffer */ |
527 | 3.88M | i4_inp_off = ps_search_prms->i4_cu_x_off; |
528 | 3.88M | i4_inp_off += ps_search_prms->i4_cu_y_off * i4_inp_stride; |
529 | | |
530 | | /*************************************************************************/ |
531 | | /* Depending on flag i4_use_rec, we use either input of previously */ |
532 | | /* encoded pictures or we use recon of previously encoded pictures. */ |
533 | | /*************************************************************************/ |
534 | 3.88M | if(ps_search_prms->i4_use_rec == 1) |
535 | 3.88M | { |
536 | 3.88M | i4_ref_stride = ps_layer_ctxt->i4_rec_stride; |
537 | 3.88M | ppu1_ref = ps_layer_ctxt->ppu1_list_rec_fxfy; |
538 | 3.88M | } |
539 | 0 | else |
540 | 0 | { |
541 | 0 | i4_ref_stride = ps_layer_ctxt->i4_rec_stride; |
542 | 0 | ppu1_ref = ps_layer_ctxt->ppu1_list_inp; |
543 | 0 | } |
544 | 3.88M | i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off; |
545 | | /* Obtain the blk size of the search blk. Assumed here that the search */ |
546 | | /* is done on a CU size, rather than any arbitrary blk size. */ |
547 | 3.88M | ps_search_results = ps_search_prms->ps_search_results; |
548 | 3.88M | e_blk_size = ps_search_prms->e_blk_size; |
549 | 3.88M | i4_blk_wd = gau1_blk_size_to_wd[e_blk_size]; |
550 | 3.88M | i4_blk_ht = gau1_blk_size_to_ht[e_blk_size]; |
551 | 3.88M | e_cu_size = ps_search_results->e_cu_size; |
552 | | |
553 | | /* Assuming cu size of 8x8 as enum 0, the other will be 1, 2, 3 */ |
554 | | /* This will also set the shift w.r.t. the base cu size of 8x8 */ |
555 | 3.88M | shift_for_cu_size = e_cu_size; |
556 | | |
557 | 3.88M | ps_search_node = ps_search_prms->ps_search_nodes; |
558 | 3.88M | i4_num_nodes = ps_search_prms->i4_num_search_nodes; |
559 | 3.88M | i4_part_mask = ps_search_prms->i4_part_mask; |
560 | | |
561 | | /* Update the parameters used to pass to SAD */ |
562 | | /* input ptr, strides, SAD Grid, part mask, blk width and ht */ |
563 | | /* The above are fixed ptrs, only pu1_ref and grid mask are */ |
564 | | /* varying params which are updated just before calling fxn */ |
565 | 3.88M | s_err_prms.i4_inp_stride = i4_inp_stride; |
566 | 3.88M | s_err_prms.i4_ref_stride = i4_ref_stride; |
567 | 3.88M | s_err_prms.i4_part_mask = i4_part_mask; |
568 | 3.88M | s_err_prms.pi4_sad_grid = &ai4_sad_grid[0]; |
569 | 3.88M | s_err_prms.i4_blk_wd = i4_blk_wd; |
570 | 3.88M | s_err_prms.i4_blk_ht = i4_blk_ht; |
571 | 3.88M | s_err_prms.i4_step = 1; |
572 | 3.88M | s_err_prms.i4_num_partitions = ps_fullpel_refine_ctxt->i4_num_valid_parts; |
573 | | |
574 | 3.88M | s_result_prms.pf_mv_cost_compute = ps_search_prms->pf_mv_cost_compute; |
575 | 3.88M | s_result_prms.ps_search_results = ps_search_results; |
576 | 3.88M | s_result_prms.i1_ref_idx = (S08)ps_search_prms->i1_ref_idx; |
577 | 3.88M | s_result_prms.pi4_sad_grid = ai4_sad_grid; |
578 | 3.88M | s_result_prms.i4_part_mask = i4_part_mask; |
579 | 3.88M | s_result_prms.i4_step = 1; |
580 | 3.88M | pf_calc_sad_and_result = hme_get_calc_sad_and_result_fxn( |
581 | 3.88M | i1_grid_flag, |
582 | 3.88M | ps_search_prms->u1_is_cu_noisy, |
583 | 3.88M | i4_part_mask, |
584 | 3.88M | ps_fullpel_refine_ctxt->i4_num_valid_parts, |
585 | 3.88M | ps_search_results->u1_num_results_per_part); |
586 | | |
587 | 3.88M | pf_calc_sad_and_result( |
588 | 3.88M | ps_search_prms, ps_wt_inp_prms, &s_err_prms, &s_result_prms, ppu1_ref, i4_ref_stride); |
589 | 3.88M | } |
590 | | |
591 | | static __inline FT_CALC_SAD_AND_RESULT *hme_get_calc_sad_and_result_explicit_fxn( |
592 | | ihevce_me_optimised_function_list_t *ps_me_optimised_function_list, |
593 | | S32 i4_part_mask, |
594 | | S32 i4_num_partitions, |
595 | | S08 i1_grid_enable, |
596 | | U08 u1_num_results_per_part) |
597 | 4.34M | { |
598 | 4.34M | FT_CALC_SAD_AND_RESULT *pf_func = NULL; |
599 | | |
600 | 4.34M | if(2 == u1_num_results_per_part) |
601 | 1.33M | { |
602 | 1.33M | if(i4_part_mask == 1) |
603 | 800k | { |
604 | 800k | ASSERT(i4_num_partitions == 1); |
605 | | |
606 | 800k | if(i1_grid_enable == 0) |
607 | 384k | { |
608 | 384k | pf_func = |
609 | 384k | ps_me_optimised_function_list->pf_calc_pt_sad_and_2_best_results_explicit_8x8; |
610 | 384k | } |
611 | 415k | else |
612 | 415k | { |
613 | 415k | pf_func = ps_me_optimised_function_list |
614 | 415k | ->pf_calc_pt_sad_and_2_best_results_explicit_8x8_for_grid; |
615 | 415k | } |
616 | 800k | } |
617 | 535k | else |
618 | 535k | { |
619 | 535k | ASSERT(i4_num_partitions == 5); |
620 | | |
621 | 535k | pf_func = |
622 | 535k | ps_me_optimised_function_list->pf_calc_pt_sad_and_2_best_results_explicit_8x8_4x4; |
623 | 535k | } |
624 | 1.33M | } |
625 | 3.00M | else if(1 == u1_num_results_per_part) |
626 | 3.00M | { |
627 | 3.00M | if(i4_part_mask == 1) |
628 | 887k | { |
629 | 887k | ASSERT(i4_num_partitions == 1); |
630 | | |
631 | 887k | if(i1_grid_enable == 0) |
632 | 170k | { |
633 | 170k | pf_func = |
634 | 170k | ps_me_optimised_function_list->pf_calc_pt_sad_and_1_best_result_explicit_8x8; |
635 | 170k | } |
636 | 717k | else |
637 | 717k | { |
638 | 717k | pf_func = ps_me_optimised_function_list |
639 | 717k | ->pf_calc_pt_sad_and_1_best_result_explicit_8x8_for_grid; |
640 | 717k | } |
641 | 887k | } |
642 | 2.11M | else |
643 | 2.11M | { |
644 | 2.11M | ASSERT(i4_num_partitions == 5); |
645 | | |
646 | 2.11M | pf_func = |
647 | 2.11M | ps_me_optimised_function_list->pf_calc_pt_sad_and_1_best_result_explicit_8x8_4x4; |
648 | 2.11M | } |
649 | 3.00M | } |
650 | | |
651 | 4.34M | return pf_func; |
652 | 4.34M | } |
653 | | |
654 | | /** |
655 | | ******************************************************************************** |
656 | | * @fn void hme_pred_search_no_encode(hme_search_prms_t *ps_search_prms, |
657 | | * layer_ctxt_t *ps_layer_ctxt, |
658 | | * wgt_pred_ctxt_t *ps_wt_inp_prms, |
659 | | * S32 *pi4_valid_part_ids, |
660 | | * S32 disable_refine, |
661 | | * ME_QUALITY_PRESETS_T e_me_quality_preset) |
662 | | * |
663 | | * @brief Implements predictive search after removing duplicate candidates |
664 | | * from initial list. Each square grid (of step 1) is expanded |
665 | | * to nine search pts before the dedeuplication process. one point |
666 | | * cost is then evaluated for each unique node after the deduplication |
667 | | * process |
668 | | * |
669 | | * @param[in,out] ps_search_prms: All the params to this function |
670 | | * |
671 | | * @param[in] ps_layer_ctxt: All info about this layer |
672 | | * |
673 | | * @return None |
674 | | ******************************************************************************** |
675 | | */ |
676 | | void hme_pred_search_no_encode( |
677 | | hme_search_prms_t *ps_search_prms, |
678 | | layer_ctxt_t *ps_layer_ctxt, |
679 | | wgt_pred_ctxt_t *ps_wt_inp_prms, |
680 | | S32 *pi4_valid_part_ids, |
681 | | S32 disable_refine, |
682 | | ME_QUALITY_PRESETS_T e_me_quality_preset, |
683 | | S08 i1_grid_enable, |
684 | | ihevce_me_optimised_function_list_t *ps_me_optimised_function_list) |
685 | 4.34M | { |
686 | | /* Stores the SAD for all parts at each pt in the grid */ |
687 | 4.34M | S32 ai4_sad_grid[9 * TOT_NUM_PARTS]; |
688 | | |
689 | | /* Atributes of input candidates */ |
690 | 4.34M | search_node_t *ps_search_node; |
691 | 4.34M | search_results_t *ps_search_results; |
692 | 4.34M | S32 i4_num_nodes; |
693 | | |
694 | | /* Input and reference attributes */ |
695 | 4.34M | S32 i4_inp_stride, i4_ref_stride, i4_ref_offset; |
696 | | |
697 | | /* The reference is actually an array of ptrs since there are several */ |
698 | | /* reference id. So an array gets passed form calling function */ |
699 | 4.34M | U08 **ppu1_ref; |
700 | | |
701 | | /* These control number of parts and number of pts in grid to search */ |
702 | 4.34M | S32 i4_part_mask; // i4_grid_mask; |
703 | | |
704 | 4.34M | S32 shift_for_cu_size; |
705 | | /* Blk width, blk height and blk size are derived from input params */ |
706 | 4.34M | BLK_SIZE_T e_blk_size; |
707 | 4.34M | CU_SIZE_T e_cu_size; |
708 | 4.34M | S32 i4_blk_wd, i4_blk_ht; |
709 | | |
710 | | /*************************************************************************/ |
711 | | /* These functions pointers for calculating Err and the result update */ |
712 | | /* Each carries its own parameters structure, which is generated on the */ |
713 | | /* fly in this function */ |
714 | | /*************************************************************************/ |
715 | 4.34M | PF_CALC_SAD_AND_RESULT pf_calc_sad_and_result; |
716 | 4.34M | err_prms_t s_err_prms; |
717 | 4.34M | result_upd_prms_t s_result_prms; |
718 | 4.34M | S32 i4_num_results; |
719 | 4.34M | S32 i4_search_idx = ps_search_prms->i1_ref_idx; |
720 | 4.34M | S32 i4_inp_off; |
721 | 4.34M | S32 i4_num_partitions; |
722 | | |
723 | 4.34M | i4_inp_stride = ps_search_prms->i4_inp_stride; |
724 | | |
725 | | /* Move to the location of the search blk in inp buffer */ |
726 | 4.34M | i4_inp_off = ps_search_prms->i4_cu_x_off; |
727 | 4.34M | i4_inp_off += ps_search_prms->i4_cu_y_off * i4_inp_stride; |
728 | | |
729 | | /*************************************************************************/ |
730 | | /* Depending on flag i4_use_rec, we use either input of previously */ |
731 | | /* encoded pictures or we use recon of previously encoded pictures. */ |
732 | | /*************************************************************************/ |
733 | 4.34M | if(ps_search_prms->i4_use_rec == 1) |
734 | 0 | { |
735 | 0 | i4_ref_stride = ps_layer_ctxt->i4_rec_stride; |
736 | 0 | ppu1_ref = ps_layer_ctxt->ppu1_list_rec_fxfy; |
737 | 0 | } |
738 | 4.34M | else |
739 | 4.34M | { |
740 | 4.34M | i4_ref_stride = ps_layer_ctxt->i4_inp_stride; |
741 | 4.34M | ppu1_ref = ps_layer_ctxt->ppu1_list_inp; |
742 | 4.34M | } |
743 | 4.34M | i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off; |
744 | | /* Obtain the blk size of the search blk. Assumed here that the search */ |
745 | | /* is done on a CU size, rather than any arbitrary blk size. */ |
746 | 4.34M | ps_search_results = ps_search_prms->ps_search_results; |
747 | 4.34M | e_blk_size = ps_search_prms->e_blk_size; |
748 | 4.34M | i4_blk_wd = gau1_blk_size_to_wd[e_blk_size]; |
749 | 4.34M | i4_blk_ht = gau1_blk_size_to_ht[e_blk_size]; |
750 | 4.34M | e_cu_size = ps_search_results->e_cu_size; |
751 | | |
752 | | /* Assuming cu size of 8x8 as enum 0, the other will be 1, 2, 3 */ |
753 | | /* This will also set the shift w.r.t. the base cu size of 8x8 */ |
754 | 4.34M | shift_for_cu_size = e_cu_size; |
755 | | |
756 | 4.34M | ps_search_node = ps_search_prms->ps_search_nodes; |
757 | 4.34M | i4_num_nodes = ps_search_prms->i4_num_search_nodes; |
758 | 4.34M | i4_part_mask = ps_search_prms->i4_part_mask; |
759 | | |
760 | | /*************************************************************************/ |
761 | | /* This array stores the ids of the partitions whose */ |
762 | | /* SADs are updated. Since the partitions whose SADs are updated may not */ |
763 | | /* be in contiguous order, we supply another level of indirection. */ |
764 | | /*************************************************************************/ |
765 | 4.34M | i4_num_partitions = hme_create_valid_part_ids(i4_part_mask, pi4_valid_part_ids); |
766 | | |
767 | | /* Update the parameters used to pass to SAD */ |
768 | | /* input ptr, strides, SAD Grid, part mask, blk width and ht */ |
769 | | /* The above are fixed ptrs, only pu1_ref and grid mask are */ |
770 | | /* varying params which are updated just before calling fxn */ |
771 | 4.34M | s_err_prms.i4_inp_stride = i4_inp_stride; |
772 | 4.34M | s_err_prms.i4_ref_stride = i4_ref_stride; |
773 | 4.34M | s_err_prms.i4_part_mask = i4_part_mask; |
774 | 4.34M | s_err_prms.pi4_sad_grid = &ai4_sad_grid[0]; |
775 | 4.34M | s_err_prms.i4_blk_wd = i4_blk_wd; |
776 | 4.34M | s_err_prms.i4_blk_ht = i4_blk_ht; |
777 | 4.34M | s_err_prms.i4_step = 1; |
778 | 4.34M | s_err_prms.pi4_valid_part_ids = pi4_valid_part_ids; |
779 | 4.34M | s_err_prms.i4_num_partitions = i4_num_partitions; |
780 | | |
781 | 4.34M | s_result_prms.pf_mv_cost_compute = ps_search_prms->pf_mv_cost_compute; |
782 | 4.34M | s_result_prms.ps_search_results = ps_search_results; |
783 | 4.34M | s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids; |
784 | 4.34M | s_result_prms.i1_ref_idx = (S08)ps_search_prms->i1_ref_idx; |
785 | 4.34M | s_result_prms.pi4_sad_grid = ai4_sad_grid; |
786 | 4.34M | s_result_prms.i4_part_mask = i4_part_mask; |
787 | 4.34M | s_result_prms.i4_step = 1; |
788 | | |
789 | 4.34M | pf_calc_sad_and_result = hme_get_calc_sad_and_result_explicit_fxn( |
790 | 4.34M | ps_me_optimised_function_list, |
791 | 4.34M | i4_part_mask, |
792 | 4.34M | i4_num_partitions, |
793 | 4.34M | i1_grid_enable, |
794 | 4.34M | ps_search_results->u1_num_results_per_part); |
795 | | |
796 | 4.34M | pf_calc_sad_and_result( |
797 | 4.34M | ps_search_prms, ps_wt_inp_prms, &s_err_prms, &s_result_prms, ppu1_ref, i4_ref_stride); |
798 | 4.34M | } |