Coverage Report

Created: 2025-12-14 06:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/encoder/hme_subpel.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/**
22
******************************************************************************
23
* @file hme_subpel.c
24
*
25
* @brief
26
*    Subpel refinement modules for ME algo
27
*
28
* @author
29
*    Ittiam
30
*
31
*
32
* List of Functions
33
* hme_qpel_interp_avg()
34
* hme_subpel_refine_ctblist_bck()
35
* hme_subpel_refine_ctblist_fwd()
36
* hme_refine_bidirect()
37
* hme_subpel_refinement()
38
* hme_subpel_refine_ctb_fwd()
39
* hme_subpel_refine_ctb_bck()
40
* hme_create_bck_inp()
41
* hme_subpel_refine_search_node()
42
******************************************************************************
43
*/
44
45
/*****************************************************************************/
46
/* File Includes                                                             */
47
/*****************************************************************************/
48
/* System include files */
49
#include <stdio.h>
50
#include <string.h>
51
#include <stdlib.h>
52
#include <assert.h>
53
#include <stdarg.h>
54
#include <math.h>
55
#include <limits.h>
56
57
/* User include files */
58
#include "ihevc_typedefs.h"
59
#include "itt_video_api.h"
60
#include "ihevce_api.h"
61
62
#include "rc_cntrl_param.h"
63
#include "rc_frame_info_collector.h"
64
#include "rc_look_ahead_params.h"
65
66
#include "ihevc_defs.h"
67
#include "ihevc_structs.h"
68
#include "ihevc_platform_macros.h"
69
#include "ihevc_deblk.h"
70
#include "ihevc_itrans_recon.h"
71
#include "ihevc_chroma_itrans_recon.h"
72
#include "ihevc_chroma_intra_pred.h"
73
#include "ihevc_intra_pred.h"
74
#include "ihevc_inter_pred.h"
75
#include "ihevc_mem_fns.h"
76
#include "ihevc_padding.h"
77
#include "ihevc_weighted_pred.h"
78
#include "ihevc_sao.h"
79
#include "ihevc_resi_trans.h"
80
#include "ihevc_quant_iquant_ssd.h"
81
#include "ihevc_cabac_tables.h"
82
83
#include "ihevce_defs.h"
84
#include "ihevce_lap_enc_structs.h"
85
#include "ihevce_multi_thrd_structs.h"
86
#include "ihevce_multi_thrd_funcs.h"
87
#include "ihevce_me_common_defs.h"
88
#include "ihevce_had_satd.h"
89
#include "ihevce_error_codes.h"
90
#include "ihevce_bitstream.h"
91
#include "ihevce_cabac.h"
92
#include "ihevce_rdoq_macros.h"
93
#include "ihevce_function_selector.h"
94
#include "ihevce_enc_structs.h"
95
#include "ihevce_entropy_structs.h"
96
#include "ihevce_cmn_utils_instr_set_router.h"
97
#include "ihevce_enc_loop_structs.h"
98
#include "ihevce_bs_compute_ctb.h"
99
#include "ihevce_global_tables.h"
100
#include "ihevce_dep_mngr_interface.h"
101
#include "hme_datatype.h"
102
#include "hme_interface.h"
103
#include "hme_common_defs.h"
104
#include "hme_defs.h"
105
#include "ihevce_me_instr_set_router.h"
106
#include "hme_globals.h"
107
#include "hme_utils.h"
108
#include "hme_coarse.h"
109
#include "hme_fullpel.h"
110
#include "hme_subpel.h"
111
#include "hme_refine.h"
112
#include "hme_err_compute.h"
113
#include "hme_common_utils.h"
114
#include "hme_search_algo.h"
115
#include "ihevce_stasino_helpers.h"
116
#include "ihevce_common_utils.h"
117
118
/*****************************************************************************/
119
/* Function Definitions                                                      */
120
/*****************************************************************************/
121
/**
********************************************************************************
*  @fn     hme_qpel_interp_avg
*
*  @brief  Produces the prediction for one quarter-pel motion vector, either by
*          pointing straight into one of the reference planes or by averaging
*          two of them into an output buffer
*
*  @param[in,out]  ps_prms : reference plane ptrs, strides and blk size on
*                  input; pu1_final_out / i4_final_out_stride on output
*
*  @param[in]  i4_mv_x : x component of motion vector in QPEL units
*
*  @param[in]  i4_mv_y : y component of motion vector in QPEL units
*
*  @param[in]  i4_buf_id : index into apu1_interp_out[] of the buffer that
*              receives the averaged block (used only when averaging is needed)
*
*  @return None
********************************************************************************
*/
void hme_qpel_interp_avg(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
{
    /*************************************************************************/
    /* For a given QPEL pt, we need to determine the 2 source pts that are   */
    /* needed to do the QPEL averaging. i4_mv_x and i4_mv_y are the motion   */
    /* vectors in QPEL units pointing to the pt of interest, w.r.t. the 0,0  */
    /* pt of the reference blk that is colocated to the inp blk.             */
    /*    A j E k B                                                          */
    /*    l m n o p                                                          */
    /*    F q G r H                                                          */
    /*    s t u v w                                                          */
    /*    C x I y D                                                          */
    /* In above diagram, A, B, C, D are full pts at offsets (0,0),(1,0),(0,1)*/
    /* and (1,1) respectively in the fpel buffer (id = 0)                    */
    /* E and I are hxfy pts in offsets (0,0),(0,1) respectively in hxfy buf  */
    /* F and H are fxhy pts in offsets (0,0),(1,0) respectively in fxhy buf  */
    /* G is hxhy pt in offset 0,0 in hxhy buf                                */
    /* All above offsets are computed w.r.t. motion displaced pt in          */
    /* respective bufs. This means that A corresponds to (i4_mv_x >> 2) and  */
    /* (i4_mv_y >> 2) in fxfy buf. Ditto with E, F and G                     */
    /* fxfy buf is buf id 0, hxfy is buf id 1, fxhy is buf id 2, hxhy is 3   */
    /* Consider pt v, which has a fractional comp of (3, 3). v is the avg of */
    /* H and I, so the table look up of v should give the following:         */
    /* buf 1 (H) : offset = (1, 0) buf id = 2.                               */
    /* buf 2 (I) : offset = (0, 1) buf id = 1.                               */
    /* The code below treats a descriptor whose two buf ids are equal as a   */
    /* pt that needs no averaging (fxfy/hxfy/fxhy/hxhy).                     */
    /* NOTE(review): an older note said buf id 1 is -1 for such pts; confirm */
    /* against the gas_qpel_inp_buf_cfg table.                               */
    /*************************************************************************/
    S32 i4_stride = ps_prms->i4_ref_stride;

    /* Full-pel part of the displacement, expressed as a byte offset */
    S32 i4_fpel_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_stride;

    /* Descriptor selected by the fractional (y, x) parts of the mv */
    qpel_input_buf_cfg_t *ps_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y & 3][i4_mv_x & 3];

    U08 *pu1_first, *pu1_second, *pu1_avg_out;

    /* The first source pt is needed on both paths */
    pu1_first = ps_prms->ppu1_ref[ps_cfg->i1_buf_id1];
    pu1_first += ps_cfg->i1_buf_xoff1 + i4_fpel_offset;
    pu1_first += (ps_cfg->i1_buf_yoff1 * i4_stride);

    if(ps_cfg->i1_buf_id1 == ps_cfg->i1_buf_id2)
    {
        /* Single source: hand back a pointer into the reference plane */
        /* itself, nothing is copied or averaged                       */
        ps_prms->pu1_final_out = pu1_first;
        ps_prms->i4_final_out_stride = i4_stride;
        return;
    }

    pu1_second = ps_prms->ppu1_ref[ps_cfg->i1_buf_id2];
    pu1_second += ps_cfg->i1_buf_xoff2 + i4_fpel_offset;
    pu1_second += (ps_cfg->i1_buf_yoff2 * i4_stride);

    /* Two sources: average them into the caller selected output buffer */
    pu1_avg_out = ps_prms->apu1_interp_out[i4_buf_id];

    hevc_avg_2d(
        pu1_first,
        pu1_second,
        i4_stride,
        i4_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht,
        pu1_avg_out,
        ps_prms->i4_out_stride);

    ps_prms->pu1_final_out = pu1_avg_out;
    ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
}
193
194
static __inline void hme_qpel_interp_avg_2pt_vert_no_reuse(
195
    interp_prms_t *ps_prms,
196
    S32 i4_mv_x,
197
    S32 i4_mv_y,
198
    U08 **ppu1_final,
199
    S32 *pi4_final_stride,
200
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
201
35.3k
{
202
35.3k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
203
204
35.3k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
205
35.3k
}
206
207
static __inline void hme_qpel_interp_avg_2pt_horz_no_reuse(
208
    interp_prms_t *ps_prms,
209
    S32 i4_mv_x,
210
    S32 i4_mv_y,
211
    U08 **ppu1_final,
212
    S32 *pi4_final_stride,
213
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
214
26.7k
{
215
26.7k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
216
217
26.7k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
218
26.7k
}
219
220
/********************************************************************************
221
*  @fn     hme_qpel_interp_comprehensive
222
*
223
*  @brief  Interpolates 2 qpel points by hpel averaging
224
*
225
*  @param[in,out]  ps_prms: Both input buffer ptrs and location of output
226
*
227
*  @param[in]  i4_mv_x : x component of motion vector in QPEL units
228
*
229
*  @param[in]  i4_mv_y : y component of motion vector in QPEL units
230
*
231
*  @param[in]  i4_grid_mask : mask which determines qpels to be computed
232
*
233
*  @param[out]  ppu1_final : storage for final buffer pointers
234
*
235
*  @param[out]  pi4_final_stride : storage for final buffer strides
236
*
237
*  @return None
238
********************************************************************************
239
*/
240
static __inline void hme_qpel_interp_comprehensive(
241
    interp_prms_t *ps_prms,
242
    U08 **ppu1_final,
243
    S32 *pi4_final_stride,
244
    S32 i4_mv_x,
245
    S32 i4_mv_y,
246
    S32 i4_grid_mask,
247
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
248
2.81M
{
249
2.81M
    S32 pt_select_for_TB, pt_select_for_LR;
250
2.81M
    S32 dx, dy, dydx;
251
2.81M
    S32 vert_func_selector, horz_func_selector;
252
253
2.81M
    S32 i4_ref_stride = ps_prms->i4_ref_stride;
254
255
2.81M
    pt_select_for_TB =
256
2.81M
        ((i4_grid_mask & (1 << PT_B)) >> PT_B) + ((i4_grid_mask & (1 << PT_T)) >> (PT_T - 1));
257
258
2.81M
    pt_select_for_LR =
259
2.81M
        ((i4_grid_mask & (1 << PT_R)) >> PT_R) + ((i4_grid_mask & (1 << PT_L)) >> (PT_L - 1));
260
261
2.81M
    dx = (i4_mv_x & 3);
262
2.81M
    dy = (i4_mv_y & 3);
263
2.81M
    dydx = (dx + (dy << 2));
264
265
2.81M
    vert_func_selector = gai4_select_qpel_function_vert[pt_select_for_TB][dydx];
266
2.81M
    horz_func_selector = gai4_select_qpel_function_horz[pt_select_for_LR][dydx];
267
268
    /* case descriptions */
269
    /* Let T = (gridmask & T) & B = (gridmask & B) */
270
    /* & hp = pt is an hpel or an fpel */
271
    /* & r = reuse possible */
272
    /* 0 => T || B = 0 */
273
    /* 1 => (!T) && (B) && hp */
274
    /* 2 => (T) && (!B) && hp */
275
    /* 3 => (!T) && (B) && !hp */
276
    /* 4 => (T) && (!B) && !hp */
277
    /* 5 => (T) && (B) && !hp && r */
278
    /* 6 => (T) && (B) && !hp && !r */
279
    /* 7 => (T) && (B) && hp */
280
281
2.81M
    switch(vert_func_selector)
282
2.81M
    {
283
15
    case 0:
284
15
    {
285
15
        break;
286
0
    }
287
61.5k
    case 1:
288
61.5k
    {
289
61.5k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
290
61.5k
        qpel_input_buf_cfg_t *ps_inp_cfg;
291
61.5k
        S32 i4_mvyp1 = (i4_mv_y + 1);
292
293
61.5k
        i4_mv_x_frac = dx;
294
61.5k
        i4_mv_y_frac = i4_mvyp1 & 3;
295
296
61.5k
        i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
297
298
        /* Derive the descriptor that has all offset and size info */
299
61.5k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
300
301
61.5k
        ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
302
61.5k
        ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
303
61.5k
        ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
304
61.5k
        pi4_final_stride[3] = i4_ref_stride;
305
306
61.5k
        break;
307
0
    }
308
59.1k
    case 2:
309
59.1k
    {
310
59.1k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
311
59.1k
        qpel_input_buf_cfg_t *ps_inp_cfg;
312
59.1k
        S32 i4_mvym1 = (i4_mv_y - 1);
313
314
59.1k
        i4_mv_x_frac = dx;
315
59.1k
        i4_mv_y_frac = i4_mvym1 & 3;
316
317
59.1k
        i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
318
319
        /* Derive the descriptor that has all offset and size info */
320
59.1k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
321
322
59.1k
        ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
323
59.1k
        ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
324
59.1k
        ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
325
59.1k
        pi4_final_stride[1] = i4_ref_stride;
326
327
59.1k
        break;
328
0
    }
329
17.7k
    case 3:
330
17.7k
    {
331
17.7k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
332
17.7k
            ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
333
334
17.7k
        break;
335
0
    }
336
15.6k
    case 4:
337
15.6k
    {
338
15.6k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
339
15.6k
            ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
340
341
15.6k
        break;
342
0
    }
343
2.62M
    case 5:
344
2.62M
    {
345
2.62M
        ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_vert_with_reuse(
346
2.62M
            ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
347
2.62M
        break;
348
0
    }
349
35.3k
    case 6:
350
35.3k
    {
351
35.3k
        hme_qpel_interp_avg_2pt_vert_no_reuse(
352
35.3k
            ps_prms,
353
35.3k
            i4_mv_x,
354
35.3k
            i4_mv_y,
355
35.3k
            ppu1_final,
356
35.3k
            pi4_final_stride,
357
35.3k
            ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
358
35.3k
        break;
359
0
    }
360
0
    case 7:
361
0
    {
362
0
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
363
0
        qpel_input_buf_cfg_t *ps_inp_cfg;
364
365
0
        S32 i4_mvyp1 = (i4_mv_y + 1);
366
0
        S32 i4_mvym1 = (i4_mv_y - 1);
367
368
0
        i4_mv_x_frac = dx;
369
0
        i4_mv_y_frac = i4_mvyp1 & 3;
370
371
0
        i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
372
373
        /* Derive the descriptor that has all offset and size info */
374
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
375
376
0
        ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
377
0
        ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
378
0
        ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
379
0
        pi4_final_stride[3] = i4_ref_stride;
380
381
0
        i4_mv_y_frac = i4_mvym1 & 3;
382
383
0
        i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
384
385
        /* Derive the descriptor that has all offset and size info */
386
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
387
388
0
        ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
389
0
        ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
390
0
        ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
391
0
        pi4_final_stride[1] = i4_ref_stride;
392
393
0
        break;
394
0
    }
395
2.81M
    }
396
397
    /* case descriptions */
398
    /* Let L = (gridmask & L) & R = (gridmask & R) */
399
    /* & hp = pt is an hpel or an fpel */
400
    /* & r = reuse possible */
401
    /* 0 => L || R = 0 */
402
    /* 1 => (!L) && (R) && hp */
403
    /* 2 => (L) && (!R) && hp */
404
    /* 3 => (!L) && (R) && !hp */
405
    /* 4 => (L) && (!R) && !hp */
406
    /* 5 => (L) && (R) && !hp && r */
407
    /* 6 => (L) && (R) && !hp && !r */
408
    /* 7 => (L) && (R) && hp */
409
410
2.81M
    switch(horz_func_selector)
411
2.81M
    {
412
0
    case 0:
413
0
    {
414
0
        break;
415
0
    }
416
58.1k
    case 1:
417
58.1k
    {
418
58.1k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
419
58.1k
        qpel_input_buf_cfg_t *ps_inp_cfg;
420
58.1k
        S32 i4_mvxp1 = (i4_mv_x + 1);
421
422
58.1k
        i4_mv_x_frac = i4_mvxp1 & 3;
423
58.1k
        i4_mv_y_frac = dy;
424
425
58.1k
        i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
426
427
        /* Derive the descriptor that has all offset and size info */
428
58.1k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
429
430
58.1k
        ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
431
58.1k
        ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
432
58.1k
        ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
433
58.1k
        pi4_final_stride[2] = i4_ref_stride;
434
435
58.1k
        break;
436
0
    }
437
50.3k
    case 2:
438
50.3k
    {
439
50.3k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
440
50.3k
        qpel_input_buf_cfg_t *ps_inp_cfg;
441
50.3k
        S32 i4_mvxm1 = (i4_mv_x - 1);
442
443
50.3k
        i4_mv_x_frac = i4_mvxm1 & 3;
444
50.3k
        i4_mv_y_frac = dy;
445
446
50.3k
        i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
447
448
        /* Derive the descriptor that has all offset and size info */
449
50.3k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
450
451
50.3k
        ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
452
50.3k
        ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
453
50.3k
        ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
454
50.3k
        pi4_final_stride[0] = i4_ref_stride;
455
456
50.3k
        break;
457
0
    }
458
25.4k
    case 3:
459
25.4k
    {
460
25.4k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
461
25.4k
            ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
462
463
25.4k
        break;
464
0
    }
465
20.1k
    case 4:
466
20.1k
    {
467
20.1k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
468
20.1k
            ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
469
470
20.1k
        break;
471
0
    }
472
2.63M
    case 5:
473
2.63M
    {
474
2.63M
        ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_horz_with_reuse(
475
2.63M
            ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
476
2.63M
        break;
477
0
    }
478
26.7k
    case 6:
479
26.7k
    {
480
26.7k
        hme_qpel_interp_avg_2pt_horz_no_reuse(
481
26.7k
            ps_prms,
482
26.7k
            i4_mv_x,
483
26.7k
            i4_mv_y,
484
26.7k
            ppu1_final,
485
26.7k
            pi4_final_stride,
486
26.7k
            ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
487
26.7k
        break;
488
0
    }
489
0
    case 7:
490
0
    {
491
0
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
492
0
        qpel_input_buf_cfg_t *ps_inp_cfg;
493
494
0
        S32 i4_mvxp1 = (i4_mv_x + 1);
495
0
        S32 i4_mvxm1 = (i4_mv_x - 1);
496
497
0
        i4_mv_x_frac = i4_mvxp1 & 3;
498
0
        i4_mv_y_frac = dy;
499
500
0
        i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
501
502
        /* Derive the descriptor that has all offset and size info */
503
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
504
505
0
        ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
506
0
        ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
507
0
        ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
508
0
        pi4_final_stride[2] = i4_ref_stride;
509
510
0
        i4_mv_x_frac = i4_mvxm1 & 3;
511
512
0
        i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
513
514
        /* Derive the descriptor that has all offset and size info */
515
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
516
517
0
        ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
518
0
        ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
519
0
        ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
520
0
        pi4_final_stride[0] = i4_ref_stride;
521
522
0
        break;
523
0
    }
524
2.81M
    }
525
2.81M
}
526
527
/**
528
********************************************************************************
529
*  @fn     S32 hme_compute_pred_and_evaluate_bi(hme_subpel_prms_t *ps_prms,
530
*                                   search_results_t *ps_search_results,
531
*                                   layer_ctxt_t *ps_curr_layer,
532
*                                   U08 **ppu1_pred)
533
*
534
*
535
*  @brief  Evaluates the best bipred cost as avg(P0, P1) where P0 and P1 are
536
*          best L0 and L1 bufs respectively for the entire CU
537
*
538
*  @param[in]  ps_prms: subpel prms input to this function
539
*
540
*  @param[in] ps_curr_layer: points to the current layer ctxt
541
*
542
*  @return None (the function is declared void); it evaluates the best BI cost or the best uni cost, whichever is better
543
********************************************************************************
544
*/
545
void hme_compute_pred_and_evaluate_bi(
546
    inter_cu_results_t *ps_cu_results,
547
    inter_pu_results_t *ps_pu_results,
548
    inter_ctb_prms_t *ps_inter_ctb_prms,
549
    part_type_results_t *ps_part_type_result,
550
    ULWORD64 *pu8_winning_pred_sigmaXSquare,
551
    ULWORD64 *pu8_winning_pred_sigmaX,
552
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
553
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
554
5.56M
{
555
    /* Idx0 - Uni winner */
556
    /* Idx1 - Uni runner-up */
557
    /* Idx2 - Bi winner */
558
5.56M
    hme_pred_buf_info_t as_pred_buf_data[3][NUM_INTER_PU_PARTS];
559
5.56M
    err_prms_t s_err_prms;
560
5.56M
    interp_prms_t s_interp_prms;
561
562
5.56M
    PF_SAD_FXN_T pf_err_compute;
563
564
5.56M
    S32 i, j;
565
5.56M
    S32 x_off, y_off, x_pic, y_pic;
566
5.56M
    S32 i4_sad_grid;
567
5.56M
    U08 e_cu_size;
568
5.56M
    S32 i4_part_type;
569
5.56M
    U08 u1_cu_size;
570
5.56M
    S32 shift;
571
5.56M
    S32 x_part, y_part, num_parts;
572
5.56M
    S32 inp_stride, ref_stride;
573
5.56M
    U08 au1_pred_buf_array_indixes[3];
574
5.56M
    S32 cur_iter_best_cost;
575
5.56M
    S32 uni_cost, bi_cost, best_cost, tot_cost;
576
    /* Idx0 - Uni winner */
577
    /* Idx1 - Bi winner */
578
5.56M
    ULWORD64 au8_sigmaX[2][NUM_INTER_PU_PARTS];
579
5.56M
    ULWORD64 au8_sigmaXSquared[2][NUM_INTER_PU_PARTS];
580
5.56M
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
581
5.56M
    S32 i4_noise_term;
582
5.56M
#endif
583
584
5.56M
    interp_prms_t *ps_interp_prms = &s_interp_prms;
585
586
5.56M
    S32 best_cand_in_opp_dir_idx = 0;
587
5.56M
    S32 is_best_cand_an_intra = 0;
588
5.56M
    U08 u1_is_cu_noisy = ps_inter_ctb_prms->u1_is_cu_noisy;
589
5.56M
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
590
5.56M
    const S32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
591
5.56M
#endif
592
5.56M
    tot_cost = 0;
593
594
    /* Start of the CU w.r.t. CTB */
595
5.56M
    x_off = ps_cu_results->u1_x_off;
596
5.56M
    y_off = ps_cu_results->u1_y_off;
597
598
5.56M
    inp_stride = ps_inter_ctb_prms->i4_inp_stride;
599
5.56M
    ref_stride = ps_inter_ctb_prms->i4_rec_stride;
600
601
5.56M
    ps_interp_prms->i4_ref_stride = ref_stride;
602
603
    /* Start of the CU w.r.t. Pic 0,0 */
604
5.56M
    x_pic = x_off + ps_inter_ctb_prms->i4_ctb_x_off;
605
5.56M
    y_pic = y_off + ps_inter_ctb_prms->i4_ctb_y_off;
606
607
5.56M
    u1_cu_size = ps_cu_results->u1_cu_size;
608
5.56M
    e_cu_size = u1_cu_size;
609
5.56M
    shift = (S32)e_cu_size;
610
5.56M
    i4_part_type = ps_part_type_result->u1_part_type;
611
5.56M
    num_parts = gau1_num_parts_in_part_type[i4_part_type];
612
613
22.2M
    for(i = 0; i < 3; i++)
614
16.6M
    {
615
16.6M
        hme_init_pred_buf_info(
616
16.6M
            &as_pred_buf_data[i],
617
16.6M
            &ps_inter_ctb_prms->s_pred_buf_mngr,
618
16.6M
            (ps_part_type_result->as_pu_results->pu.b4_wd + 1) << 2,
619
16.6M
            (ps_part_type_result->as_pu_results->pu.b4_ht + 1) << 2,
620
16.6M
            (PART_TYPE_T)i4_part_type);
621
622
16.6M
        au1_pred_buf_array_indixes[i] = as_pred_buf_data[i][0].u1_pred_buf_array_id;
623
16.6M
    }
624
625
12.5M
    for(j = 0; j < num_parts; j++)
626
6.98M
    {
627
6.98M
        UWORD8 *apu1_hpel_ref[2][4];
628
6.98M
        PART_ID_T e_part_id;
629
6.98M
        BLK_SIZE_T e_blk_size;
630
6.98M
        WORD8 i1_ref_idx;
631
6.98M
        UWORD8 pred_dir;
632
6.98M
        WORD32 ref_offset, inp_offset, wd, ht;
633
6.98M
        pu_result_t *ps_pu_node1, *ps_pu_node2, *ps_pu_result;
634
6.98M
        mv_t *aps_mv[2];
635
6.98M
        UWORD8 num_active_ref_opp;
636
6.98M
        UWORD8 num_results_per_part;
637
6.98M
        WORD32 luma_weight_ref1, luma_offset_ref1;
638
6.98M
        WORD32 luma_weight_ref2, luma_offset_ref2;
639
6.98M
        WORD32 pu_node2_found = 0;
640
641
6.98M
        e_part_id = ge_part_type_to_part_id[i4_part_type][j];
642
6.98M
        e_blk_size = ge_part_id_to_blk_size[e_cu_size][e_part_id];
643
644
6.98M
        x_part = gas_part_attr_in_cu[e_part_id].u1_x_start << shift;
645
6.98M
        y_part = gas_part_attr_in_cu[e_part_id].u1_y_start << shift;
646
647
6.98M
        ref_offset = (x_part + x_pic) + (y_pic + y_part) * ref_stride;
648
6.98M
        inp_offset = (x_part + y_part * inp_stride) + ps_cu_results->i4_inp_offset;
649
650
6.98M
        pred_dir = ps_part_type_result->as_pu_results[j].pu.b2_pred_mode;
651
652
6.98M
        ps_pu_node1 = &(ps_part_type_result->as_pu_results[j]);
653
654
6.98M
        if(PRED_L0 == pred_dir)
655
6.36M
        {
656
6.36M
            i1_ref_idx = ps_pu_node1->pu.mv.i1_l0_ref_idx;
657
6.36M
            aps_mv[0] = &(ps_pu_node1->pu.mv.s_l0_mv);
658
659
6.36M
            num_active_ref_opp =
660
6.36M
                ps_inter_ctb_prms->u1_num_active_ref_l1 * (ps_inter_ctb_prms->i4_bidir_enabled);
661
6.36M
            num_results_per_part = ps_pu_results->u1_num_results_per_part_l0[e_part_id];
662
663
6.36M
            ps_pu_result = ps_pu_results->aps_pu_results[PRED_L0][e_part_id];
664
665
6.36M
            ASSERT(i1_ref_idx >= 0);
666
667
6.36M
            apu1_hpel_ref[0][0] =
668
6.36M
                (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
669
6.36M
                ref_offset;
670
6.36M
            apu1_hpel_ref[0][1] =
671
6.36M
                ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
672
6.36M
                ref_offset;
673
6.36M
            apu1_hpel_ref[0][2] =
674
6.36M
                ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
675
6.36M
                ref_offset;
676
6.36M
            apu1_hpel_ref[0][3] =
677
6.36M
                ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
678
6.36M
                ref_offset;
679
680
6.36M
            luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
681
6.36M
                                   ->s_weight_offset.i2_luma_weight;
682
6.36M
            luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
683
6.36M
                                   ->s_weight_offset.i2_luma_offset;
684
6.36M
        }
685
613k
        else
686
613k
        {
687
613k
            i1_ref_idx = ps_pu_node1->pu.mv.i1_l1_ref_idx;
688
613k
            aps_mv[0] = &(ps_pu_node1->pu.mv.s_l1_mv);
689
690
613k
            ASSERT(i1_ref_idx >= 0);
691
692
613k
            num_active_ref_opp =
693
613k
                ps_inter_ctb_prms->u1_num_active_ref_l0 * (ps_inter_ctb_prms->i4_bidir_enabled);
694
613k
            num_results_per_part = ps_pu_results->u1_num_results_per_part_l1[e_part_id];
695
696
613k
            ps_pu_result = ps_pu_results->aps_pu_results[PRED_L1][e_part_id];
697
698
613k
            apu1_hpel_ref[0][0] =
699
613k
                (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
700
613k
                ref_offset;
701
613k
            apu1_hpel_ref[0][1] =
702
613k
                ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
703
613k
                ref_offset;
704
613k
            apu1_hpel_ref[0][2] =
705
613k
                ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
706
613k
                ref_offset;
707
613k
            apu1_hpel_ref[0][3] =
708
613k
                ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
709
613k
                ref_offset;
710
711
613k
            luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
712
613k
                                   ->s_weight_offset.i2_luma_weight;
713
613k
            luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
714
613k
                                   ->s_weight_offset.i2_luma_offset;
715
613k
        }
716
717
6.98M
        if(aps_mv[0]->i2_mvx == INTRA_MV)
718
0
        {
719
0
            uni_cost = ps_pu_node1->i4_tot_cost;
720
0
            cur_iter_best_cost = ps_pu_node1->i4_tot_cost;
721
0
            best_cost = MIN(uni_cost, cur_iter_best_cost);
722
0
            tot_cost += best_cost;
723
0
            continue;
724
0
        }
725
726
6.98M
        ps_interp_prms->i4_blk_wd = wd = gau1_blk_size_to_wd[e_blk_size];
727
6.98M
        ps_interp_prms->i4_blk_ht = ht = gau1_blk_size_to_ht[e_blk_size];
728
6.98M
        ps_interp_prms->i4_out_stride = MAX_CU_SIZE;
729
730
6.98M
        if(num_active_ref_opp)
731
1.91M
        {
732
1.91M
            if(PRED_L0 == pred_dir)
733
1.35M
            {
734
1.35M
                if(ps_pu_results->u1_num_results_per_part_l1[e_part_id])
735
1.25M
                {
736
1.25M
                    ps_pu_node2 = ps_pu_results->aps_pu_results[1][e_part_id];
737
1.25M
                    pu_node2_found = 1;
738
1.25M
                }
739
1.35M
            }
740
567k
            else
741
567k
            {
742
567k
                if(ps_pu_results->u1_num_results_per_part_l0[e_part_id])
743
566k
                {
744
566k
                    ps_pu_node2 = ps_pu_results->aps_pu_results[0][e_part_id];
745
566k
                    pu_node2_found = 1;
746
566k
                }
747
567k
            }
748
1.91M
        }
749
750
6.98M
        if(!pu_node2_found)
751
5.16M
        {
752
5.16M
            bi_cost = INT_MAX >> 1;
753
754
5.16M
            s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
755
5.16M
            ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
756
757
5.16M
            ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
758
5.16M
                ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
759
760
5.16M
            if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
761
4.87M
            {
762
4.87M
                as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
763
4.87M
                as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
764
4.87M
                as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
765
4.87M
            }
766
767
5.16M
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
768
0
            {
769
0
                hme_compute_sigmaX_and_sigmaXSquared(
770
0
                    as_pred_buf_data[0][j].pu1_pred,
771
0
                    as_pred_buf_data[0][j].i4_pred_stride,
772
0
                    &au8_sigmaX[0][j],
773
0
                    &au8_sigmaXSquared[0][j],
774
0
                    ps_interp_prms->i4_blk_wd,
775
0
                    ps_interp_prms->i4_blk_ht,
776
0
                    ps_interp_prms->i4_blk_wd,
777
0
                    ps_interp_prms->i4_blk_ht,
778
0
                    0,
779
0
                    1);
780
0
            }
781
5.16M
        }
782
1.81M
        else
783
1.81M
        {
784
1.81M
            i = 0;
785
1.81M
            bi_cost = MAX_32BIT_VAL;
786
1.81M
            is_best_cand_an_intra = 0;
787
1.81M
            best_cand_in_opp_dir_idx = 0;
788
789
1.81M
            pred_dir = ps_pu_node2[i].pu.b2_pred_mode;
790
791
1.81M
            if(PRED_L0 == pred_dir)
792
566k
            {
793
566k
                i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l0_ref_idx;
794
566k
                aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l0_mv);
795
796
566k
                ASSERT(i1_ref_idx >= 0);
797
798
566k
                apu1_hpel_ref[1][0] =
799
566k
                    (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
800
566k
                                   ->s_yuv_buf_desc.pv_y_buf) +
801
566k
                    ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
802
566k
                apu1_hpel_ref[1][1] =
803
566k
                    ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
804
566k
                    ref_offset;
805
566k
                apu1_hpel_ref[1][2] =
806
566k
                    ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
807
566k
                    ref_offset;
808
566k
                apu1_hpel_ref[1][3] =
809
566k
                    ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
810
566k
                    ref_offset;
811
812
566k
                luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
813
566k
                                       ->s_weight_offset.i2_luma_weight;
814
566k
                luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
815
566k
                                       ->s_weight_offset.i2_luma_offset;
816
566k
            }
817
1.25M
            else
818
1.25M
            {
819
1.25M
                i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l1_ref_idx;
820
1.25M
                aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l1_mv);
821
822
1.25M
                ASSERT(i1_ref_idx >= 0);
823
824
1.25M
                apu1_hpel_ref[1][0] =
825
1.25M
                    (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
826
1.25M
                                   ->s_yuv_buf_desc.pv_y_buf) +
827
1.25M
                    ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
828
1.25M
                apu1_hpel_ref[1][1] =
829
1.25M
                    ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
830
1.25M
                    ref_offset;
831
1.25M
                apu1_hpel_ref[1][2] =
832
1.25M
                    ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
833
1.25M
                    ref_offset;
834
1.25M
                apu1_hpel_ref[1][3] =
835
1.25M
                    ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
836
1.25M
                    ref_offset;
837
838
1.25M
                luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
839
1.25M
                                       ->s_weight_offset.i2_luma_weight;
840
1.25M
                luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
841
1.25M
                                       ->s_weight_offset.i2_luma_offset;
842
1.25M
            }
843
844
1.81M
            if(aps_mv[1]->i2_mvx == INTRA_MV)
845
0
            {
846
0
                uni_cost = ps_pu_node1->i4_tot_cost;
847
0
                cur_iter_best_cost = ps_pu_node2[i].i4_tot_cost;
848
849
0
                if(cur_iter_best_cost < bi_cost)
850
0
                {
851
0
                    bi_cost = cur_iter_best_cost;
852
0
                    best_cand_in_opp_dir_idx = i;
853
0
                    is_best_cand_an_intra = 1;
854
0
                }
855
856
0
                best_cost = MIN(uni_cost, bi_cost);
857
0
                tot_cost += best_cost;
858
0
                continue;
859
0
            }
860
861
1.81M
            s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
862
1.81M
            ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
863
864
1.81M
            ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
865
1.81M
                ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
866
867
1.81M
            if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
868
1.74M
            {
869
1.74M
                as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
870
1.74M
                as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
871
1.74M
                as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
872
1.74M
            }
873
874
1.81M
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
875
0
            {
876
0
                hme_compute_sigmaX_and_sigmaXSquared(
877
0
                    as_pred_buf_data[0][j].pu1_pred,
878
0
                    as_pred_buf_data[0][j].i4_pred_stride,
879
0
                    &au8_sigmaX[0][j],
880
0
                    &au8_sigmaXSquared[0][j],
881
0
                    ps_interp_prms->i4_blk_wd,
882
0
                    ps_interp_prms->i4_blk_ht,
883
0
                    ps_interp_prms->i4_blk_wd,
884
0
                    ps_interp_prms->i4_blk_ht,
885
0
                    0,
886
0
                    1);
887
0
            }
888
889
1.81M
            s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[1][j].pu1_pred;
890
1.81M
            ps_interp_prms->ppu1_ref = &apu1_hpel_ref[1][0];
891
892
1.81M
            ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
893
1.81M
                ps_interp_prms, aps_mv[1]->i2_mvx, aps_mv[1]->i2_mvy, 0);
894
895
1.81M
            if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
896
1.74M
            {
897
1.74M
                as_pred_buf_data[1][j].u1_pred_buf_array_id = UCHAR_MAX;
898
1.74M
                as_pred_buf_data[1][j].pu1_pred = ps_interp_prms->pu1_final_out;
899
1.74M
                as_pred_buf_data[1][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
900
1.74M
            }
901
902
1.81M
            ps_cmn_utils_optimised_function_list->pf_wt_avg_2d(
903
1.81M
                as_pred_buf_data[0][j].pu1_pred,
904
1.81M
                as_pred_buf_data[1][j].pu1_pred,
905
1.81M
                as_pred_buf_data[0][j].i4_pred_stride,
906
1.81M
                as_pred_buf_data[1][j].i4_pred_stride,
907
1.81M
                wd,
908
1.81M
                ht,
909
1.81M
                as_pred_buf_data[2][j].pu1_pred,
910
1.81M
                as_pred_buf_data[2][j].i4_pred_stride,
911
1.81M
                luma_weight_ref1,
912
1.81M
                luma_weight_ref2,
913
1.81M
                luma_offset_ref1,
914
1.81M
                luma_offset_ref2,
915
1.81M
                ps_inter_ctb_prms->wpred_log_wdc);
916
917
1.81M
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
918
0
            {
919
0
                hme_compute_sigmaX_and_sigmaXSquared(
920
0
                    as_pred_buf_data[2][j].pu1_pred,
921
0
                    as_pred_buf_data[2][j].i4_pred_stride,
922
0
                    &au8_sigmaX[1][j],
923
0
                    &au8_sigmaXSquared[1][j],
924
0
                    ps_interp_prms->i4_blk_wd,
925
0
                    ps_interp_prms->i4_blk_ht,
926
0
                    ps_interp_prms->i4_blk_wd,
927
0
                    ps_interp_prms->i4_blk_ht,
928
0
                    0,
929
0
                    1);
930
0
            }
931
932
1.81M
            s_err_prms.pu1_inp = (U08 *)ps_inter_ctb_prms->pu1_non_wt_inp + inp_offset;
933
1.81M
            s_err_prms.i4_inp_stride = inp_stride;
934
1.81M
            s_err_prms.i4_ref_stride = as_pred_buf_data[2][j].i4_pred_stride;
935
1.81M
            s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
936
1.81M
            s_err_prms.i4_grid_mask = 1;
937
1.81M
            s_err_prms.pi4_sad_grid = &i4_sad_grid;
938
1.81M
            s_err_prms.i4_blk_wd = wd;
939
1.81M
            s_err_prms.i4_blk_ht = ht;
940
1.81M
            s_err_prms.pu1_ref = as_pred_buf_data[2][j].pu1_pred;
941
1.81M
            s_err_prms.ps_cmn_utils_optimised_function_list = ps_cmn_utils_optimised_function_list;
942
943
1.81M
            if(ps_inter_ctb_prms->u1_use_satd)
944
1.68M
            {
945
1.68M
                pf_err_compute = compute_satd_8bit;
946
1.68M
            }
947
134k
            else
948
134k
            {
949
134k
                pf_err_compute = ps_me_optimised_function_list->pf_evalsad_pt_npu_mxn_8bit;
950
134k
            }
951
952
1.81M
            pf_err_compute(&s_err_prms);
953
954
1.81M
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
955
1.81M
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
956
0
            {
957
0
                unsigned long u4_shift_val;
958
0
                ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
959
0
                ULWORD64 u8_temp_var, u8_temp_var1;
960
0
                S32 i4_bits_req;
961
962
0
                S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
963
964
0
                u8_pred_sigmaSquareX = (au8_sigmaX[1][j] * au8_sigmaX[1][j]);
965
0
                u8_pred_variance = au8_sigmaXSquared[1][j] - u8_pred_sigmaSquareX;
966
967
0
                if(e_cu_size == CU_8x8)
968
0
                {
969
0
                    PART_ID_T e_part_id =
970
0
                        (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
971
972
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
973
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaX,
974
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
975
0
                        &u8_src_variance,
976
0
                        i4_default_src_wt,
977
0
                        0,
978
0
                        ps_inter_ctb_prms->wpred_log_wdc,
979
0
                        e_part_id);
980
0
                }
981
0
                else
982
0
                {
983
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
984
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaX,
985
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
986
0
                        &u8_src_variance,
987
0
                        i4_default_src_wt,
988
0
                        0,
989
0
                        ps_inter_ctb_prms->wpred_log_wdc,
990
0
                        e_part_id);
991
0
                }
992
993
0
                u8_pred_variance = u8_pred_variance >> u4_shift_val;
994
995
0
                GETRANGE64(i4_bits_req, u8_pred_variance);
996
997
0
                if(i4_bits_req > 27)
998
0
                {
999
0
                    u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1000
0
                    u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1001
0
                }
1002
1003
0
                if(u8_src_variance == u8_pred_variance)
1004
0
                {
1005
0
                    u8_temp_var = (1 << STIM_Q_FORMAT);
1006
0
                }
1007
0
                else
1008
0
                {
1009
0
                    u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1010
0
                    u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1011
0
                    u8_temp_var1 =
1012
0
                        (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1013
0
                    u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1014
0
                    u8_temp_var = (u8_temp_var / u8_temp_var1);
1015
0
                }
1016
1017
0
                i4_noise_term = (UWORD32)u8_temp_var;
1018
1019
0
                i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1020
1021
0
                ASSERT(i4_noise_term >= 0);
1022
1023
0
                u8_temp_var = i4_sad_grid;
1024
0
                u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1025
0
                u8_temp_var += (1 << ((i4_q_level)-1));
1026
0
                i4_sad_grid = (UWORD32)(u8_temp_var >> (i4_q_level));
1027
0
            }
1028
1.81M
#endif
1029
1030
1.81M
            cur_iter_best_cost = i4_sad_grid;
1031
1.81M
            cur_iter_best_cost += ps_pu_node1->i4_mv_cost;
1032
1.81M
            cur_iter_best_cost += ps_pu_node2[i].i4_mv_cost;
1033
1034
1.81M
            if(cur_iter_best_cost < bi_cost)
1035
1.81M
            {
1036
1.81M
                bi_cost = cur_iter_best_cost;
1037
1.81M
                best_cand_in_opp_dir_idx = i;
1038
1.81M
                is_best_cand_an_intra = 0;
1039
1.81M
            }
1040
1.81M
        }
1041
1042
6.98M
        uni_cost = ps_pu_node1->i4_tot_cost;
1043
1044
6.98M
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
1045
6.98M
        if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1046
0
        {
1047
0
            unsigned long u4_shift_val;
1048
0
            ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
1049
0
            ULWORD64 u8_temp_var, u8_temp_var1;
1050
0
            S32 i4_bits_req;
1051
1052
0
            S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
1053
1054
0
            S08 i1_ref_idx =
1055
0
                (PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1056
0
                    ? ps_inter_ctb_prms->pi1_past_list[ps_pu_node1->pu.mv.i1_l0_ref_idx]
1057
0
                    : ps_inter_ctb_prms->pi1_future_list[ps_pu_node1->pu.mv.i1_l1_ref_idx];
1058
0
            S32 i4_sad = ps_pu_node1->i4_tot_cost - ps_pu_node1->i4_mv_cost;
1059
1060
0
            u8_pred_sigmaSquareX = (au8_sigmaX[0][j] * au8_sigmaX[0][j]);
1061
0
            u8_pred_variance = au8_sigmaXSquared[0][j] - u8_pred_sigmaSquareX;
1062
1063
0
            if(e_cu_size == CU_8x8)
1064
0
            {
1065
0
                PART_ID_T e_part_id =
1066
0
                    (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
1067
1068
0
                u4_shift_val = ihevce_calc_stim_injected_variance(
1069
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaX,
1070
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1071
0
                    &u8_src_variance,
1072
0
                    ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1073
0
                    ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1074
0
                    ps_inter_ctb_prms->wpred_log_wdc,
1075
0
                    e_part_id);
1076
0
            }
1077
0
            else
1078
0
            {
1079
0
                u4_shift_val = ihevce_calc_stim_injected_variance(
1080
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaX,
1081
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1082
0
                    &u8_src_variance,
1083
0
                    ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1084
0
                    ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1085
0
                    ps_inter_ctb_prms->wpred_log_wdc,
1086
0
                    e_part_id);
1087
0
            }
1088
1089
0
            u8_pred_variance = u8_pred_variance >> (u4_shift_val);
1090
1091
0
            GETRANGE64(i4_bits_req, u8_pred_variance);
1092
1093
0
            if(i4_bits_req > 27)
1094
0
            {
1095
0
                u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1096
0
                u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1097
0
            }
1098
1099
0
            if(u8_src_variance == u8_pred_variance)
1100
0
            {
1101
0
                u8_temp_var = (1 << STIM_Q_FORMAT);
1102
0
            }
1103
0
            else
1104
0
            {
1105
0
                u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1106
0
                u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1107
0
                u8_temp_var1 =
1108
0
                    (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1109
0
                u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1110
0
                u8_temp_var = (u8_temp_var / u8_temp_var1);
1111
0
            }
1112
1113
0
            i4_noise_term = (UWORD32)u8_temp_var;
1114
1115
0
            i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1116
1117
0
            ASSERT(i4_noise_term >= 0);
1118
1119
0
            u8_temp_var = i4_sad;
1120
0
            u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1121
0
            u8_temp_var += (1 << ((i4_q_level)-1));
1122
0
            i4_sad = (UWORD32)(u8_temp_var >> (i4_q_level));
1123
1124
0
            uni_cost = i4_sad + ps_pu_node1->i4_mv_cost;
1125
1126
0
            pu8_winning_pred_sigmaX[j] = au8_sigmaX[0][j];
1127
0
            pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[0][j];
1128
0
        }
1129
6.98M
#endif
1130
1131
6.98M
        if((bi_cost < uni_cost) && (!is_best_cand_an_intra))
1132
52.0k
        {
1133
52.0k
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1134
0
            {
1135
0
                pu8_winning_pred_sigmaX[j] = au8_sigmaX[1][j];
1136
0
                pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[1][j];
1137
0
            }
1138
1139
52.0k
            if(PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1140
26.1k
            {
1141
26.1k
                ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1142
1143
26.1k
                if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1144
0
                {
1145
0
                    ps_pu_node1->pu.mv.i1_l1_ref_idx =
1146
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1147
0
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1148
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1149
0
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1150
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1151
0
                }
1152
26.1k
                else
1153
26.1k
                {
1154
26.1k
                    ps_pu_node1->pu.mv.i1_l1_ref_idx =
1155
26.1k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1156
26.1k
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1157
26.1k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1158
26.1k
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1159
26.1k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1160
26.1k
                }
1161
26.1k
            }
1162
25.8k
            else
1163
25.8k
            {
1164
25.8k
                ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1165
1166
25.8k
                if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1167
25.8k
                {
1168
25.8k
                    ps_pu_node1->pu.mv.i1_l0_ref_idx =
1169
25.8k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1170
25.8k
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1171
25.8k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1172
25.8k
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1173
25.8k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1174
25.8k
                }
1175
0
                else
1176
0
                {
1177
0
                    ps_pu_node1->pu.mv.i1_l0_ref_idx =
1178
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1179
0
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1180
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1181
0
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1182
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1183
0
                }
1184
25.8k
            }
1185
1186
52.0k
            ps_part_type_result->as_pu_results[j].i4_tot_cost = bi_cost;
1187
52.0k
        }
1188
1189
6.98M
        best_cost = MIN(uni_cost, bi_cost);
1190
6.98M
        tot_cost += best_cost;
1191
6.98M
    }
1192
1193
5.56M
    hme_debrief_bipred_eval(
1194
5.56M
        ps_part_type_result,
1195
5.56M
        as_pred_buf_data,
1196
5.56M
        &ps_inter_ctb_prms->s_pred_buf_mngr,
1197
5.56M
        au1_pred_buf_array_indixes,
1198
5.56M
        ps_cmn_utils_optimised_function_list);
1199
1200
5.56M
    ps_part_type_result->i4_tot_cost = tot_cost;
1201
5.56M
}
1202
1203
WORD32 hme_evalsatd_pt_pu_8x8_tu_rec(
1204
    err_prms_t *ps_prms,
1205
    WORD32 lambda,
1206
    WORD32 lambda_q_shift,
1207
    WORD32 i4_frm_qstep,
1208
    me_func_selector_t *ps_func_selector)
1209
2.55M
{
1210
2.55M
    S32 ai4_satd_4x4[4]; /* num 4x4s in a 8x8 */
1211
2.55M
    S32 i4_satd_8x8;
1212
2.55M
    S16 *pi2_had_out;
1213
2.55M
    S32 i4_tu_split_flag = 0;
1214
2.55M
    S32 i4_tu_early_cbf = 0;
1215
1216
2.55M
    S32 i4_early_cbf = 1;
1217
    //  S32 i4_i, i4_k;
1218
2.55M
    S32 i4_total_satd_cost = 0;
1219
2.55M
    S32 best_cost_tu_split;
1220
1221
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1222
2.55M
    S32 *api4_satd_pu[HAD_32x32 + 1];
1223
2.55M
    S32 *api4_tu_split[HAD_32x32 + 1];
1224
2.55M
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1225
1226
2.55M
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1227
2.55M
    S32 *pi4_tu_split = ps_prms->pi4_tu_split_flags;
1228
2.55M
    S32 *pi4_early_cbf = ps_prms->pi4_tu_early_cbf;
1229
1230
2.55M
    U08 *pu1_inp = ps_prms->pu1_inp;
1231
2.55M
    U08 *pu1_ref = ps_prms->pu1_ref;
1232
1233
2.55M
    S32 inp_stride = ps_prms->i4_inp_stride;
1234
2.55M
    S32 ref_stride = ps_prms->i4_ref_stride;
1235
1236
    /* Initialize tu_split_cost to "0" */
1237
2.55M
    ps_prms->i4_tu_split_cost = 0;
1238
2.55M
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1239
1240
2.55M
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1241
2.55M
    api4_satd_pu[HAD_8x8] = &i4_satd_8x8;
1242
2.55M
    api4_satd_pu[HAD_16x16] = NULL;
1243
2.55M
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1244
1245
2.55M
    api4_tu_split[HAD_4x4] = NULL;
1246
2.55M
    api4_tu_split[HAD_8x8] = &i4_tu_split_flag;
1247
2.55M
    api4_tu_split[HAD_16x16] = NULL;
1248
2.55M
    api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1249
1250
2.55M
    api4_tu_early_cbf[HAD_4x4] = NULL;
1251
2.55M
    api4_tu_early_cbf[HAD_8x8] = &i4_tu_early_cbf;
1252
2.55M
    api4_tu_early_cbf[HAD_16x16] = NULL;
1253
2.55M
    api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1254
1255
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1256
1257
    /* Return value is merge of both best_stad_cost and tu_split_flags */
1258
2.55M
    best_cost_tu_split = ps_func_selector->pf_had_8x8_using_4_4x4_r(
1259
2.55M
        pu1_inp,
1260
2.55M
        inp_stride,
1261
2.55M
        pu1_ref,
1262
2.55M
        ref_stride,
1263
2.55M
        pi2_had_out,
1264
2.55M
        8,
1265
2.55M
        api4_satd_pu,
1266
2.55M
        api4_tu_split,
1267
2.55M
        api4_tu_early_cbf,
1268
2.55M
        0,
1269
2.55M
        2,
1270
2.55M
        0,
1271
2.55M
        0,
1272
2.55M
        i4_frm_qstep,
1273
2.55M
        0,
1274
2.55M
        ps_prms->u1_max_tr_depth,
1275
2.55M
        ps_prms->u1_max_tr_size,
1276
2.55M
        &(ps_prms->i4_tu_split_cost),
1277
2.55M
        NULL);
1278
1279
    /* For SATD computation following TU size are assumed for a 8x8 CU */
1280
    /* 8 for 2Nx2N, 4 for Nx2N,2NxN                                    */
1281
1282
2.55M
    i4_total_satd_cost = best_cost_tu_split >> 2;
1283
1284
    /* Second last bit has the tu pslit flag */
1285
2.55M
    i4_tu_split_flag = (best_cost_tu_split & 0x3) >> 1;
1286
1287
    /* Last bit corrsponds to the Early CBF flag */
1288
2.55M
    i4_early_cbf = (best_cost_tu_split & 0x1);
1289
1290
    /* Update 8x8 SATDs */
1291
2.55M
    pi4_sad_grid[PART_ID_2Nx2N] = i4_satd_8x8;
1292
2.55M
    pi4_tu_split[PART_ID_2Nx2N] = i4_tu_split_flag;
1293
2.55M
    pi4_early_cbf[PART_ID_2Nx2N] = i4_early_cbf;
1294
1295
2.55M
    return i4_total_satd_cost;
1296
2.55M
}
1297
//#endif
1298
/**
1299
********************************************************************************
1300
*  @fn     S32 hme_evalsatd_update_1_best_result_pt_pu_16x16
1301
*
1302
*  @brief  Evaluates the SATD with partial updates for all the best partitions
1303
*          of a 16x16 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
1304
*
1305
*  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
1306
*                 pointer to sad grid of each partitions
1307
*
1308
*  @return     None
1309
********************************************************************************
1310
*/
1311
1312
void hme_evalsatd_update_2_best_results_pt_pu_16x16(
1313
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
1314
0
{
1315
0
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1316
0
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1317
0
    S32 i4_satd_16x16; /* 16x16 satd cost     */
1318
0
    S32 i;
1319
0
    S16 ai2_8x8_had[256];
1320
0
    S16 *pi2_y0;
1321
0
    U08 *pu1_src, *pu1_pred;
1322
0
    S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
1323
0
    S32 *ppi4_hsad;
1324
1325
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1326
0
    S32 *api4_satd_pu[HAD_32x32 + 1];
1327
0
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1328
1329
0
    U08 *pu1_inp = ps_prms->pu1_inp;
1330
0
    U08 *pu1_ref = ps_prms->pu1_ref;
1331
1332
0
    S32 inp_stride = ps_prms->i4_inp_stride;
1333
0
    S32 ref_stride = ps_prms->i4_ref_stride;
1334
1335
0
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1336
0
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1337
0
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1338
0
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1339
1340
0
    ppi4_hsad = api4_satd_pu[HAD_16x16];
1341
1342
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1343
0
    for(i = 0; i < 4; i++)
1344
0
    {
1345
0
        pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8;
1346
0
        pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8;
1347
0
        pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1348
0
        pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1349
1350
0
        ihevce_had_8x8_using_4_4x4(
1351
0
            pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
1352
0
    }
1353
1354
    /* For SATD computation following TU size are assumed for a 16x16 CU */
1355
    /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs     */
1356
1357
    /* Update 8x8 SATDs */
1358
    /* Modified to cost calculation using only 4x4 SATD */
1359
1360
    //  ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1361
    //  ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7];
1362
    //  ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13];
1363
    //  ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1364
1365
    /* Update 16x16 SATDs */
1366
0
    pi4_sad_grid[PART_ID_2Nx2N] =
1367
0
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1368
1369
0
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
1370
0
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
1371
0
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
1372
0
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
1373
1374
    /* Update 8x16 / 16x8 SATDs */
1375
0
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
1376
0
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
1377
0
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
1378
0
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
1379
1380
    /* Update AMP SATDs 16x12,16x4, 12x16,4x16  */
1381
0
    pi4_sad_grid[PART_ID_nLx2N_L] =
1382
0
        ai4_satd_4x4[0] + ai4_satd_4x4[4] + ai4_satd_4x4[8] + ai4_satd_4x4[12];
1383
1384
0
    pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_4x4[1] + ai4_satd_4x4[5] + ai4_satd_4x4[9] +
1385
0
                                    ai4_satd_4x4[13] + pi4_sad_grid[PART_ID_Nx2N_R];
1386
1387
0
    pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_4x4[2] + ai4_satd_4x4[6] + ai4_satd_4x4[10] +
1388
0
                                    ai4_satd_4x4[14] + pi4_sad_grid[PART_ID_Nx2N_L];
1389
1390
0
    pi4_sad_grid[PART_ID_nRx2N_R] =
1391
0
        ai4_satd_4x4[3] + ai4_satd_4x4[7] + ai4_satd_4x4[11] + ai4_satd_4x4[15];
1392
1393
0
    pi4_sad_grid[PART_ID_2NxnU_T] =
1394
0
        ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[2] + ai4_satd_4x4[3];
1395
1396
0
    pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_4x4[4] + ai4_satd_4x4[5] + ai4_satd_4x4[6] +
1397
0
                                    ai4_satd_4x4[7] + pi4_sad_grid[PART_ID_2NxN_B];
1398
1399
0
    pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[10] +
1400
0
                                    ai4_satd_4x4[11] + pi4_sad_grid[PART_ID_2NxN_T];
1401
1402
0
    pi4_sad_grid[PART_ID_2NxnD_B] =
1403
0
        ai4_satd_4x4[12] + ai4_satd_4x4[13] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1404
1405
    /* Call the update results function */
1406
0
    {
1407
0
        S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1408
0
        mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1409
0
        S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
1410
0
        S32 best_node_cost;
1411
0
        S32 second_best_node_cost;
1412
1413
        /*For each valid partition, update the refine_prm structure to reflect the best and second
1414
        best candidates for that partition*/
1415
1416
0
        for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
1417
0
        {
1418
0
            S32 update_required = 0;
1419
0
            S32 part_id = pi4_valid_part_ids[i4_count];
1420
0
            S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
1421
1422
            /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1423
0
            i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1424
1425
            /*Calculate total cost*/
1426
0
            i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
1427
0
            i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
1428
1429
            /*****************************************************************/
1430
            /* We do not labor through the results if the total cost worse   */
1431
            /* than the last of the results.                                 */
1432
            /*****************************************************************/
1433
0
            best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
1434
0
            second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
1435
1436
0
            if(i4_tot_cost < second_best_node_cost)
1437
0
            {
1438
0
                update_required = 2;
1439
1440
                /*************************************************************/
1441
                /* Identify where the current result isto be placed.Basically*/
1442
                /* find the node which has cost just higher thannodeundertest*/
1443
                /*************************************************************/
1444
0
                if(i4_tot_cost < best_node_cost)
1445
0
                {
1446
0
                    update_required = 1;
1447
0
                }
1448
0
                else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
1449
0
                {
1450
0
                    update_required = 0;
1451
0
                }
1452
0
                if(update_required == 2)
1453
0
                {
1454
0
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
1455
0
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
1456
0
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
1457
0
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
1458
0
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
1459
0
                }
1460
0
                else if(update_required == 1)
1461
0
                {
1462
0
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
1463
0
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index];
1464
0
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
1465
0
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1466
0
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] =
1467
0
                        ps_subpel_refine_ctxt->i2_mv_x[0][index];
1468
0
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] =
1469
0
                        ps_subpel_refine_ctxt->i2_mv_y[0][index];
1470
0
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
1471
0
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index];
1472
1473
0
                    ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
1474
0
                    ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
1475
0
                    ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
1476
0
                    ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
1477
0
                    ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
1478
0
                }
1479
0
            }
1480
0
        }
1481
0
    }
1482
0
}
1483
1484
//#if COMPUTE_16x16_R == C
1485
void hme_evalsatd_update_1_best_result_pt_pu_16x16(
1486
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
1487
17.2M
{
1488
17.2M
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1489
17.2M
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1490
17.2M
    S32 i4_satd_16x16; /* 16x16 satd cost     */
1491
17.2M
    S32 i;
1492
17.2M
    S16 ai2_8x8_had[256];
1493
17.2M
    S16 *pi2_y0;
1494
17.2M
    U08 *pu1_src, *pu1_pred;
1495
17.2M
    S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
1496
17.2M
    S32 *ppi4_hsad;
1497
1498
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1499
17.2M
    S32 *api4_satd_pu[HAD_32x32 + 1];
1500
17.2M
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1501
1502
17.2M
    U08 *pu1_inp = ps_prms->pu1_inp;
1503
17.2M
    U08 *pu1_ref = ps_prms->pu1_ref;
1504
1505
17.2M
    S32 inp_stride = ps_prms->i4_inp_stride;
1506
17.2M
    S32 ref_stride = ps_prms->i4_ref_stride;
1507
1508
17.2M
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1509
17.2M
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1510
17.2M
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1511
17.2M
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1512
1513
17.2M
    ppi4_hsad = api4_satd_pu[HAD_16x16];
1514
1515
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1516
86.4M
    for(i = 0; i < 4; i++)
1517
69.1M
    {
1518
69.1M
        pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8;
1519
69.1M
        pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8;
1520
69.1M
        pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1521
69.1M
        pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1522
1523
69.1M
        ihevce_had_8x8_using_4_4x4(
1524
69.1M
            pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
1525
69.1M
    }
1526
1527
    /* For SATD computation following TU size are assumed for a 16x16 CU */
1528
    /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs     */
1529
1530
    /* Update 8x8 SATDs */
1531
    /* Modified to cost calculation using only 4x4 SATD */
1532
1533
    //  ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1534
    //  ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7];
1535
    //  ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13];
1536
    //  ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1537
1538
    /* Update 16x16 SATDs */
1539
17.2M
    pi4_sad_grid[PART_ID_2Nx2N] =
1540
17.2M
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1541
1542
17.2M
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
1543
17.2M
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
1544
17.2M
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
1545
17.2M
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
1546
1547
    /* Update 8x16 / 16x8 SATDs */
1548
17.2M
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
1549
17.2M
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
1550
17.2M
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
1551
17.2M
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
1552
1553
    /* Update AMP SATDs 16x12,16x4, 12x16,4x16  */
1554
17.2M
    pi4_sad_grid[PART_ID_nLx2N_L] =
1555
17.2M
        ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
1556
17.2M
    pi4_sad_grid[PART_ID_nRx2N_R] =
1557
17.2M
        ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
1558
17.2M
    pi4_sad_grid[PART_ID_2NxnU_T] =
1559
17.2M
        ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1560
17.2M
    pi4_sad_grid[PART_ID_2NxnD_B] =
1561
17.2M
        ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1562
1563
17.2M
    pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
1564
17.2M
    pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
1565
17.2M
    pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
1566
17.2M
    pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
1567
1568
    /* Call the update results function */
1569
17.2M
    {
1570
17.2M
        S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1571
17.2M
        mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1572
17.2M
        S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
1573
17.2M
        S32 best_node_cost;
1574
17.2M
        S32 second_best_node_cost;
1575
1576
        /*For each valid partition, update the refine_prm structure to reflect the best and second
1577
        best candidates for that partition*/
1578
1579
257M
        for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
1580
240M
        {
1581
240M
            S32 update_required = 0;
1582
240M
            S32 part_id = pi4_valid_part_ids[i4_count];
1583
240M
            S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
1584
1585
            /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1586
240M
            i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1587
1588
            /*Calculate total cost*/
1589
240M
            i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
1590
240M
            i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
1591
1592
            /*****************************************************************/
1593
            /* We do not labor through the results if the total cost worse   */
1594
            /* than the last of the results.                                 */
1595
            /*****************************************************************/
1596
240M
            best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
1597
240M
            second_best_node_cost = SHRT_MAX;
1598
1599
240M
            if(i4_tot_cost < second_best_node_cost)
1600
240M
            {
1601
240M
                update_required = 0;
1602
1603
                /*************************************************************/
1604
                /* Identify where the current result isto be placed.Basically*/
1605
                /* find the node which has cost just higher thannodeundertest*/
1606
                /*************************************************************/
1607
240M
                if(i4_tot_cost < best_node_cost)
1608
6.38M
                {
1609
6.38M
                    update_required = 1;
1610
6.38M
                }
1611
233M
                else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
1612
163M
                {
1613
163M
                    update_required = 0;
1614
163M
                }
1615
240M
                if(update_required == 2)
1616
0
                {
1617
0
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
1618
0
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
1619
0
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
1620
0
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
1621
0
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
1622
0
                }
1623
240M
                else if(update_required == 1)
1624
6.38M
                {
1625
6.38M
                    ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
1626
6.38M
                    ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
1627
6.38M
                    ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
1628
6.38M
                    ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
1629
6.38M
                    ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
1630
6.38M
                }
1631
240M
            }
1632
240M
        }
1633
17.2M
    }
1634
17.2M
}
1635
1636
WORD32 hme_evalsatd_pt_pu_16x16_tu_rec(
1637
    err_prms_t *ps_prms,
1638
    WORD32 lambda,
1639
    WORD32 lambda_q_shift,
1640
    WORD32 i4_frm_qstep,
1641
    me_func_selector_t *ps_func_selector)
1642
2.26M
{
1643
2.26M
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1644
2.26M
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1645
2.26M
    S32 ai4_tu_split_8x8[16];
1646
2.26M
    S32 i4_satd_16x16; /* 16x16 satd cost     */
1647
1648
2.26M
    S32 ai4_tu_early_cbf_8x8[16];
1649
1650
    //S16 ai2_had_out[256];
1651
2.26M
    S16 *pi2_had_out;
1652
2.26M
    S32 tu_split_flag = 0;
1653
2.26M
    S32 early_cbf_flag = 0;
1654
2.26M
    S32 total_satd_cost = 0;
1655
1656
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1657
2.26M
    S32 *api4_satd_pu[HAD_32x32 + 1];
1658
2.26M
    S32 *api4_tu_split[HAD_32x32 + 1];
1659
2.26M
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1660
1661
2.26M
    U08 *pu1_inp = ps_prms->pu1_inp;
1662
2.26M
    U08 *pu1_ref = ps_prms->pu1_ref;
1663
1664
2.26M
    S32 inp_stride = ps_prms->i4_inp_stride;
1665
2.26M
    S32 ref_stride = ps_prms->i4_ref_stride;
1666
1667
    /* Initialize tu_split_cost to "0" */
1668
2.26M
    ps_prms->i4_tu_split_cost = 0;
1669
1670
2.26M
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1671
1672
2.26M
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1673
2.26M
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1674
2.26M
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1675
2.26M
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1676
1677
2.26M
    api4_tu_split[HAD_4x4] = NULL;
1678
2.26M
    api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
1679
2.26M
    api4_tu_split[HAD_16x16] = &tu_split_flag;
1680
2.26M
    api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1681
1682
2.26M
    api4_tu_early_cbf[HAD_4x4] = NULL;
1683
2.26M
    api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
1684
2.26M
    api4_tu_early_cbf[HAD_16x16] = &early_cbf_flag;
1685
2.26M
    api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1686
1687
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1688
2.26M
    ps_func_selector->pf_had_16x16_r(
1689
2.26M
        pu1_inp,
1690
2.26M
        inp_stride,
1691
2.26M
        pu1_ref,
1692
2.26M
        ref_stride,
1693
2.26M
        pi2_had_out,
1694
2.26M
        16,
1695
2.26M
        api4_satd_pu,
1696
2.26M
        api4_tu_split,
1697
2.26M
        api4_tu_early_cbf,
1698
2.26M
        0,
1699
2.26M
        4,
1700
2.26M
        lambda,
1701
2.26M
        lambda_q_shift,
1702
2.26M
        i4_frm_qstep,
1703
2.26M
        0,
1704
2.26M
        ps_prms->u1_max_tr_depth,
1705
2.26M
        ps_prms->u1_max_tr_size,
1706
2.26M
        &(ps_prms->i4_tu_split_cost),
1707
2.26M
        NULL);
1708
1709
2.26M
    total_satd_cost = i4_satd_16x16;
1710
1711
2.26M
    ps_prms->pi4_tu_split_flags[0] = tu_split_flag;
1712
1713
2.26M
    ps_prms->pi4_tu_early_cbf[0] = early_cbf_flag;
1714
1715
2.26M
    return total_satd_cost;
1716
2.26M
}
1717
1718
/**
1719
********************************************************************************
1720
*  @fn     void hme_evalsatd_pt_pu_32x32
1721
*
1722
*  @brief  Evaluates the SATD with partial updates for all the best partitions
1723
*          of a 32x32 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
1724
*
1725
*  @param[inout]  ps_prms: error prms containing current and ref ptr, strides,
1726
*                 pointer to sad grid of each partitions
1727
*
1728
*  @return     None
1729
********************************************************************************
1730
*/
1731
void hme_evalsatd_pt_pu_32x32(err_prms_t *ps_prms)
{
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32, raster order (4 per row) */
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
    S32 i;

    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    /* Evaluate the 8x8 HAD on each of the sixteen 8x8 blocks of the 32x32; */
    /* every larger SATD below is a sum of these values                     */
    for(i = 0; i < 16; i++)
    {
        S32 blk_x = i & 0x3; /* 8x8 column within the 32x32 */
        S32 blk_y = i >> 2; /* 8x8 row within the 32x32    */

        U08 *pu1_src = pu1_inp + (blk_x << 3) + (blk_y * inp_stride * 8);
        U08 *pu1_pred = pu1_ref + (blk_x << 3) + (blk_y * ref_stride * 8);

        ai4_satd_8x8[i] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
            pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
    }

    /* Modified to cost calculation using only 8x8 SATD for 32x32 */
    ai4_satd_16x16[0] = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[4] + ai4_satd_8x8[5];
    ai4_satd_16x16[1] = ai4_satd_8x8[2] + ai4_satd_8x8[3] + ai4_satd_8x8[6] + ai4_satd_8x8[7];
    ai4_satd_16x16[2] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[12] + ai4_satd_8x8[13];
    ai4_satd_16x16[3] = ai4_satd_8x8[10] + ai4_satd_8x8[11] + ai4_satd_8x8[14] + ai4_satd_8x8[15];

    /* Update 32x32 SATD */
    pi4_sad_grid[PART_ID_2Nx2N] =
        ai4_satd_16x16[0] + ai4_satd_16x16[1] + ai4_satd_16x16[2] + ai4_satd_16x16[3];

    /* Update 16x16 SATDs */
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_16x16[0];
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_16x16[1];
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_16x16[2];
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_16x16[3];

    /* Update 16x32 / 32x16 SATDs */
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_16x16[0] + ai4_satd_16x16[2];
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_16x16[1] + ai4_satd_16x16[3];
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_16x16[0] + ai4_satd_16x16[1];
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_16x16[2] + ai4_satd_16x16[3];

    /* Update AMP SATDs 32x24,32x8, 24x32,8x32                            */
    /* Narrow side : one column / row of 8x8s                             */
    /* Wide side   : the adjacent column / row of 8x8s plus the far half  */

    /* Vertical split near the left edge : 8x8 column 0 | columns 1..3 */
    pi4_sad_grid[PART_ID_nLx2N_L] =
        ai4_satd_8x8[0] + ai4_satd_8x8[4] + ai4_satd_8x8[8] + ai4_satd_8x8[12];

    pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[5] + ai4_satd_8x8[9] +
                                    ai4_satd_8x8[13] + pi4_sad_grid[PART_ID_Nx2N_R];

    /* Vertical split near the right edge : 8x8 columns 0..2 | column 3 */
    pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_8x8[2] + ai4_satd_8x8[6] + ai4_satd_8x8[10] +
                                    ai4_satd_8x8[14] + pi4_sad_grid[PART_ID_Nx2N_L];

    pi4_sad_grid[PART_ID_nRx2N_R] =
        ai4_satd_8x8[3] + ai4_satd_8x8[7] + ai4_satd_8x8[11] + ai4_satd_8x8[15];

    /* Horizontal split near the top edge : 8x8 row 0 | rows 1..3 */
    pi4_sad_grid[PART_ID_2NxnU_T] =
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];

    pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_8x8[4] + ai4_satd_8x8[5] + ai4_satd_8x8[6] +
                                    ai4_satd_8x8[7] + pi4_sad_grid[PART_ID_2NxN_B];

    /* Horizontal split near the bottom edge : 8x8 rows 0..2 | row 3 */
    pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[10] +
                                    ai4_satd_8x8[11] + pi4_sad_grid[PART_ID_2NxN_T];

    pi4_sad_grid[PART_ID_2NxnD_B] =
        ai4_satd_8x8[12] + ai4_satd_8x8[13] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
}
1815
1816
WORD32 hme_evalsatd_pt_pu_32x32_tu_rec(
1817
    err_prms_t *ps_prms,
1818
    WORD32 lambda,
1819
    WORD32 lambda_q_shift,
1820
    WORD32 i4_frm_qstep,
1821
    me_func_selector_t *ps_func_selector)
1822
682k
{
1823
682k
    S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */
1824
682k
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
1825
682k
    S32 ai4_tu_split_8x8[16];
1826
682k
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
1827
682k
    S32 ai4_tu_split_16x16[4];
1828
682k
    S32 i4_satd_32x32;
1829
1830
682k
    S32 ai4_tu_early_cbf_8x8[16];
1831
682k
    S32 ai4_tu_early_cbf_16x16[4];
1832
682k
    S32 early_cbf_flag;
1833
1834
682k
    S16 *pi2_had_out;
1835
1836
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1837
682k
    S32 *api4_satd_pu[HAD_32x32 + 1];
1838
682k
    S32 *api4_tu_split[HAD_32x32 + 1];
1839
682k
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1840
1841
682k
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1842
682k
    S32 *pi4_tu_split_flag = ps_prms->pi4_tu_split_flags;
1843
682k
    S32 *pi4_tu_early_cbf = ps_prms->pi4_tu_early_cbf;
1844
1845
682k
    S32 tu_split_flag = 0;
1846
682k
    S32 total_satd_cost = 0;
1847
1848
682k
    U08 *pu1_inp = ps_prms->pu1_inp;
1849
682k
    U08 *pu1_ref = ps_prms->pu1_ref;
1850
1851
682k
    S32 inp_stride = ps_prms->i4_inp_stride;
1852
682k
    S32 ref_stride = ps_prms->i4_ref_stride;
1853
1854
    /* Initialize tu_split_cost to "0" */
1855
682k
    ps_prms->i4_tu_split_cost = 0;
1856
1857
682k
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1858
1859
682k
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1860
682k
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1861
682k
    api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
1862
682k
    api4_satd_pu[HAD_32x32] = &i4_satd_32x32;
1863
1864
682k
    api4_tu_split[HAD_4x4] = NULL;
1865
682k
    api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
1866
682k
    api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
1867
682k
    api4_tu_split[HAD_32x32] = &tu_split_flag;
1868
1869
682k
    api4_tu_early_cbf[HAD_4x4] = NULL;
1870
682k
    api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
1871
682k
    api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
1872
682k
    api4_tu_early_cbf[HAD_32x32] = &early_cbf_flag;
1873
1874
    /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
1875
682k
    ihevce_had_32x32_r(
1876
682k
        pu1_inp,
1877
682k
        inp_stride,
1878
682k
        pu1_ref,
1879
682k
        ref_stride,
1880
682k
        pi2_had_out,
1881
682k
        32,
1882
682k
        api4_satd_pu,
1883
682k
        api4_tu_split,
1884
682k
        api4_tu_early_cbf,
1885
682k
        0,
1886
682k
        8,
1887
682k
        lambda,
1888
682k
        lambda_q_shift,
1889
682k
        i4_frm_qstep,
1890
682k
        0,
1891
682k
        ps_prms->u1_max_tr_depth,
1892
682k
        ps_prms->u1_max_tr_size,
1893
682k
        &(ps_prms->i4_tu_split_cost),
1894
682k
        ps_func_selector);
1895
1896
682k
    total_satd_cost = i4_satd_32x32;
1897
1898
    /*The structure of the TU_SPLIT flag for the current 32x32 is as follows
1899
    TL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1900
    TR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1901
    BL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1902
    BR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1903
    32x32_split - 1bit (LSBit)
1904
1905
    TU_SPLIT : (TL_16x16)_(TR_16x16)_(BL_16x16)_(BR_16x16)_32x32_split (21bits)*/
1906
1907
682k
    pi4_sad_grid[PART_ID_2Nx2N] = total_satd_cost;
1908
682k
    pi4_tu_split_flag[PART_ID_2Nx2N] = tu_split_flag;
1909
682k
    pi4_tu_early_cbf[PART_ID_2Nx2N] = early_cbf_flag;
1910
1911
682k
    return total_satd_cost;
1912
682k
}
1913
1914
/**
1915
********************************************************************************
1916
*  @fn     void hme_evalsatd_pt_pu_64x64
1917
*
1918
*  @brief  Evaluates the SATD with partial updates for all the best partitions
1919
*          of a 64x64 CU based on accumulated Hadamard 32x32 and 16x16 satds
1920
*
1921
*           Note : 64x64 SATD does not do hadamard Transform using 32x32 hadamard
1922
*                  outputs but directly uses four 32x32 SATD and 16 16x16 SATDS as
1923
*                  TU size of 64 is not supported in HEVC
1924
*
1925
*  @param[inout]  ps_prms: error prms containing current and ref ptr, strides,
1926
*                 pointer to sad grid of each partitions
1927
*
1928
*  @return     None
1929
********************************************************************************
1930
*/
1931
1932
void hme_evalsatd_pt_pu_64x64(err_prms_t *ps_prms)
{
    S32 ai4_satd_8x8[4][16]; /* [32x32 quadrant][8x8 within it, raster order] */
    S32 ai4_satd_16x16[4][4]; /* [32x32 quadrant][16x16 within it : TL,TR,BL,BR] */
    S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 : TL,TR,BL,BR */
    S32 i, j;

    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    /* Process the 64x64 one 32x32 quadrant at a time */
    for(i = 0; i < 4; i++)
    {
        S32 blkx = (i & 0x1);
        S32 blky = (i >> 1);

        /* Top-left sample of this quadrant in the input and the reference */
        U08 *pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
        U08 *pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);

        /* Evaluate the 8x8 HAD on each of the sixteen 8x8 blocks of the    */
        /* quadrant; every larger SATD below is a sum of these values       */
        for(j = 0; j < 16; j++)
        {
            U08 *pu1_src = pu1_pi0 + ((j & 0x3) << 3) + ((j >> 2) * inp_stride * 8);
            U08 *pu1_pred = pu1_pi1 + ((j & 0x3) << 3) + ((j >> 2) * ref_stride * 8);

            ai4_satd_8x8[i][j] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
                pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
        }

        /* Modified to cost calculation using only 8x8 SATD for 32x32 */
        ai4_satd_16x16[i][0] =
            ai4_satd_8x8[i][0] + ai4_satd_8x8[i][1] + ai4_satd_8x8[i][4] + ai4_satd_8x8[i][5];
        ai4_satd_16x16[i][1] =
            ai4_satd_8x8[i][2] + ai4_satd_8x8[i][3] + ai4_satd_8x8[i][6] + ai4_satd_8x8[i][7];
        ai4_satd_16x16[i][2] =
            ai4_satd_8x8[i][8] + ai4_satd_8x8[i][9] + ai4_satd_8x8[i][12] + ai4_satd_8x8[i][13];
        ai4_satd_16x16[i][3] =
            ai4_satd_8x8[i][10] + ai4_satd_8x8[i][11] + ai4_satd_8x8[i][14] + ai4_satd_8x8[i][15];

        /* 32x32 SATD of the quadrant is the sum of its four 16x16 SATDs */
        ai4_satd_32x32[i] = ai4_satd_16x16[i][0] + ai4_satd_16x16[i][1] + ai4_satd_16x16[i][2] +
                            ai4_satd_16x16[i][3];
    }

    /* Update 64x64 SATDs */
    pi4_sad_grid[PART_ID_2Nx2N] =
        ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];

    /* Update 32x32 SATDs */
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_32x32[0];
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_32x32[1];
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_32x32[2];
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_32x32[3];

    /* Update 32x64 / 64x32 SATDs */
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_32x32[0] + ai4_satd_32x32[2];
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_32x32[1] + ai4_satd_32x32[3];
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_32x32[0] + ai4_satd_32x32[1];
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_32x32[2] + ai4_satd_32x32[3];

    /* Update AMP SATDs 64x48,64x16, 48x64,16x64                            */
    /* Narrow side : one column / row of 16x16s                             */
    /* Wide side   : the adjacent column / row of 16x16s plus the far half  */

    /* Vertical split near the left edge : 16x16 column 0 | columns 1..3 */
    pi4_sad_grid[PART_ID_nLx2N_L] =
        ai4_satd_16x16[0][0] + ai4_satd_16x16[0][2] + ai4_satd_16x16[2][0] + ai4_satd_16x16[2][2];

    pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_16x16[0][1] + ai4_satd_16x16[0][3] +
                                    ai4_satd_16x16[2][1] + ai4_satd_16x16[2][3] +
                                    pi4_sad_grid[PART_ID_Nx2N_R];

    /* Vertical split near the right edge : 16x16 columns 0..2 | column 3 */
    pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_16x16[1][0] + ai4_satd_16x16[1][2] +
                                    ai4_satd_16x16[3][0] + ai4_satd_16x16[3][2] +
                                    pi4_sad_grid[PART_ID_Nx2N_L];

    pi4_sad_grid[PART_ID_nRx2N_R] =
        ai4_satd_16x16[1][1] + ai4_satd_16x16[1][3] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][3];

    /* Horizontal split near the top edge : 16x16 row 0 | rows 1..3 */
    pi4_sad_grid[PART_ID_2NxnU_T] =
        ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1];

    pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3] +
                                    ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3] +
                                    pi4_sad_grid[PART_ID_2NxN_B];

    /* Horizontal split near the bottom edge : 16x16 rows 0..2 | row 3 */
    pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] +
                                    ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] +
                                    pi4_sad_grid[PART_ID_2NxN_T];

    pi4_sad_grid[PART_ID_2NxnD_B] =
        ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
}
2051
2052
WORD32 hme_evalsatd_pt_pu_64x64_tu_rec(
2053
    err_prms_t *ps_prms,
2054
    WORD32 lambda,
2055
    WORD32 lambda_q_shift,
2056
    WORD32 i4_frm_qstep,
2057
    me_func_selector_t *ps_func_selector)
2058
61.8k
{
2059
61.8k
    S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
2060
61.8k
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
2061
61.8k
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */
2062
61.8k
    S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */
2063
2064
61.8k
    S32 ai4_tu_split_8x8[16];
2065
61.8k
    S32 ai4_tu_split_16x16[4];
2066
2067
61.8k
    S32 ai4_tu_early_cbf_8x8[16];
2068
61.8k
    S32 ai4_tu_early_cbf_16x16[4];
2069
2070
61.8k
    S16 *pi2_had_out;
2071
61.8k
    S32 i;
2072
2073
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
2074
61.8k
    S32 *api4_satd_pu[HAD_32x32 + 1];
2075
61.8k
    S32 *api4_tu_split[HAD_32x32 + 1];
2076
61.8k
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
2077
2078
61.8k
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
2079
2080
61.8k
    S32 tu_split_flag = 0;
2081
61.8k
    S32 total_satd_cost = 0;
2082
2083
61.8k
    U08 *pu1_inp = ps_prms->pu1_inp;
2084
61.8k
    U08 *pu1_ref = ps_prms->pu1_ref;
2085
2086
61.8k
    S32 inp_stride = ps_prms->i4_inp_stride;
2087
61.8k
    S32 ref_stride = ps_prms->i4_ref_stride;
2088
2089
    /* Initialize tu_split_cost to "0" */
2090
61.8k
    ps_prms->i4_tu_split_cost = 0;
2091
2092
61.8k
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
2093
2094
309k
    for(i = 0; i < 4; i++)
2095
247k
    {
2096
247k
        S32 blkx = (i & 0x1);
2097
247k
        S32 blky = (i >> 1);
2098
247k
        U08 *pu1_pi0, *pu1_pi1;
2099
247k
        tu_split_flag = 0;
2100
2101
247k
        api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
2102
247k
        api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
2103
247k
        api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
2104
247k
        api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];
2105
2106
247k
        api4_tu_split[HAD_4x4] = NULL;
2107
247k
        api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
2108
247k
        api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
2109
247k
        api4_tu_split[HAD_32x32] = &ps_prms->pi4_tu_split_flags[i];
2110
2111
247k
        api4_tu_early_cbf[HAD_4x4] = NULL;
2112
247k
        api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
2113
247k
        api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
2114
247k
        api4_tu_early_cbf[HAD_32x32] = &ps_prms->pi4_tu_early_cbf[i];
2115
2116
247k
        pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
2117
247k
        pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);
2118
2119
        /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
2120
247k
        ihevce_had_32x32_r(
2121
247k
            pu1_pi0,
2122
247k
            inp_stride,
2123
247k
            pu1_pi1,
2124
247k
            ref_stride,
2125
247k
            pi2_had_out,
2126
247k
            32,
2127
247k
            api4_satd_pu,
2128
247k
            api4_tu_split,
2129
247k
            api4_tu_early_cbf,
2130
247k
            0,
2131
247k
            8,
2132
247k
            lambda,
2133
247k
            lambda_q_shift,
2134
247k
            i4_frm_qstep,
2135
247k
            1,
2136
247k
            ps_prms->u1_max_tr_depth,
2137
247k
            ps_prms->u1_max_tr_size,
2138
247k
            &(ps_prms->i4_tu_split_cost),
2139
247k
            ps_func_selector);
2140
247k
    }
2141
2142
61.8k
    total_satd_cost = ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2143
2144
    /* Update 64x64 SATDs */
2145
61.8k
    pi4_sad_grid[PART_ID_2Nx2N] =
2146
61.8k
        ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2147
2148
61.8k
    return total_satd_cost;
2149
61.8k
}
2150
2151
/**
2152
********************************************************************************
2153
*  @fn     void hme_subpel_refine_search_node(search_node_t *ps_search_node,
2154
*                                   hme_subpel_prms_t *ps_prms,
2155
*                                   layer_ctxt_t *ps_curr_layer,
2156
*                                   BLK_SIZE_T e_blk_size,
2157
*                                   S32 x_off,
2158
*                                   S32 y_off)
2159
*
2160
*  @brief  Refines a given partition within a CU
2161
*
2162
*  @param[in,out]  ps_search_node: supplies starting mv and also ref id.
2163
*                   updated with the accurate subpel mv
2164
*
2165
*  @param[in]  ps_prms: subpel prms input to this function
2166
*
2167
*  @param[in]  ps_curr_layer : layer context
2168
*
2169
*  @param[in]  e_blk_size : Block size enumeration
2170
*
2171
*  @param[in]  x_off : x offset of the partition w.r.t. pic start
2172
*
2173
*  @param[in]  y_off : y offset of the partition w.r.t. pic start
2174
*
2175
*  @return None
2176
********************************************************************************
2177
*/
2178
2179
static __inline PF_SAD_RESULT_FXN_T hme_get_calc_sad_and_result_subpel_fxn(
2180
    me_func_selector_t *ps_func_selector,
2181
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list,
2182
    S32 i4_part_mask,
2183
    U08 u1_use_satd,
2184
    U08 u1_num_parts,
2185
    U08 u1_num_results)
2186
2.62M
{
2187
2.62M
    PF_SAD_RESULT_FXN_T pf_err_compute;
2188
2189
2.62M
    ASSERT((1 == u1_num_results) || (2 == u1_num_results));
2190
2191
2.62M
    if(1 == u1_num_results)
2192
2.62M
    {
2193
2.62M
        if(u1_use_satd)
2194
1.71M
        {
2195
1.71M
            if(u1_num_parts == 1)
2196
276k
            {
2197
276k
                pf_err_compute =
2198
276k
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_eq_1;
2199
276k
            }
2200
1.43M
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2201
66.6k
            {
2202
66.6k
                pf_err_compute =
2203
66.6k
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_9;
2204
66.6k
            }
2205
1.37M
            else
2206
1.37M
            {
2207
1.37M
                pf_err_compute =
2208
1.37M
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_17;
2209
1.37M
            }
2210
1.71M
        }
2211
907k
        else
2212
907k
        {
2213
907k
            if(u1_num_parts == 1)
2214
836k
            {
2215
836k
                pf_err_compute = ps_me_optimised_function_list
2216
836k
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_eq_1;
2217
836k
            }
2218
70.6k
            else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2219
12.2k
            {
2220
12.2k
                pf_err_compute =
2221
12.2k
                    ps_me_optimised_function_list->pf_calc_sad_and_1_best_result_subpel_square_parts;
2222
12.2k
            }
2223
58.4k
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2224
40.9k
            {
2225
40.9k
                pf_err_compute = ps_me_optimised_function_list
2226
40.9k
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_9;
2227
40.9k
            }
2228
17.4k
            else
2229
17.4k
            {
2230
17.4k
                pf_err_compute = ps_me_optimised_function_list
2231
17.4k
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_17;
2232
17.4k
            }
2233
907k
        }
2234
2.62M
    }
2235
0
    else
2236
0
    {
2237
0
        if(u1_use_satd)
2238
0
        {
2239
0
            if(u1_num_parts == 1)
2240
0
            {
2241
0
                pf_err_compute =
2242
0
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_eq_1;
2243
0
            }
2244
0
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2245
0
            {
2246
0
                pf_err_compute =
2247
0
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_9;
2248
0
            }
2249
0
            else
2250
0
            {
2251
0
                pf_err_compute =
2252
0
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_17;
2253
0
            }
2254
0
        }
2255
0
        else
2256
0
        {
2257
0
            if(u1_num_parts == 1)
2258
0
            {
2259
0
                pf_err_compute = ps_me_optimised_function_list
2260
0
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_eq_1;
2261
0
            }
2262
0
            else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2263
0
            {
2264
0
                pf_err_compute = ps_me_optimised_function_list
2265
0
                                     ->pf_calc_sad_and_2_best_results_subpel_square_parts;
2266
0
            }
2267
0
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2268
0
            {
2269
0
                pf_err_compute = ps_me_optimised_function_list
2270
0
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_9;
2271
0
            }
2272
0
            else
2273
0
            {
2274
0
                pf_err_compute = ps_me_optimised_function_list
2275
0
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_17;
2276
0
            }
2277
0
        }
2278
0
    }
2279
2280
2.62M
    return pf_err_compute;
2281
2.62M
}
2282
2283
#if DIAMOND_GRID == 1
2284
S32 hme_subpel_refine_search_node_high_speed(
2285
    search_node_t *ps_search_node,
2286
    hme_subpel_prms_t *ps_prms,
2287
    layer_ctxt_t *ps_curr_layer,
2288
    BLK_SIZE_T e_blk_size,
2289
    S32 x_off,
2290
    S32 y_off,
2291
    search_results_t *ps_search_results,
2292
    S32 pred_lx,
2293
    S32 i4_part_mask,
2294
    S32 *pi4_valid_part_ids,
2295
    S32 search_idx,
2296
    subpel_dedup_enabler_t *ps_dedup_enabler,
2297
    me_func_selector_t *ps_func_selector,
2298
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
2299
2.62M
{
2300
2.62M
    S32 i4_num_hpel_refine, i4_num_qpel_refine;
2301
2.62M
    S32 i4_offset, i4_grid_mask;
2302
2.62M
    S08 i1_ref_idx;
2303
2.62M
    S32 i4_blk_wd, i4_blk_ht;
2304
2.62M
    S32 i4_ref_stride, i4_i;
2305
2.62M
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2306
2.62M
    result_upd_prms_t s_result_prms;
2307
2.62M
    search_node_t s_temp_search_node;
2308
2309
    /*************************************************************************/
2310
    /* Tracks current MV with the fractional component.                      */
2311
    /*************************************************************************/
2312
2.62M
    S32 i4_mv_x, i4_mv_y;
2313
2.62M
    S32 i4_frac_x, i4_frac_y;
2314
2315
    /*************************************************************************/
2316
    /* Function pointer for SAD/SATD, array and prms structure to pass to    */
2317
    /* This function                                                         */
2318
    /*************************************************************************/
2319
2.62M
    PF_SAD_RESULT_FXN_T pf_err_compute;
2320
2321
2.62M
    S32 ai4_sad_grid[17], i4_tot_cost;
2322
2.62M
    err_prms_t s_err_prms;
2323
2324
    /*************************************************************************/
2325
    /* Allowed MV RANGE                                                      */
2326
    /*************************************************************************/
2327
2.62M
    range_prms_t *ps_range_prms;
2328
2329
    /*************************************************************************/
2330
    /* stores min id in grid with associated min cost.                       */
2331
    /*************************************************************************/
2332
2.62M
    S32 i4_min_cost, i4_min_sad;
2333
2.62M
    GRID_PT_T e_min_id;
2334
2335
2.62M
    PF_INTERP_FXN_T pf_qpel_interp;
2336
    /*************************************************************************/
2337
    /* For hpel and qpel we move in diamonds and hence each point in the     */
2338
    /* diamond will belong to a completely different plane. To simplify the  */
2339
    /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
2340
    /* hpel planes which are interpolated during recon.                      */
2341
    /*************************************************************************/
2342
2.62M
    U08 *apu1_hpel_ref[4], *pu1_ref;
2343
2344
2.62M
    interp_prms_t s_interp_prms;
2345
2346
    /*************************************************************************/
2347
    /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2348
    /* points to the corresponding predicted buf with its stride.            */
2349
    /* Note that the pointer cannot be derived just from the id, since the   */
2350
    /* pointer may also point to the hpel buffer (in case we request interp  */
2351
    /* of a hpel pt, which already exists in the recon hpel planes)          */
2352
    /*************************************************************************/
2353
2.62M
    U08 *pu1_final_out;
2354
2.62M
    S32 i4_final_out_stride;
2355
2.62M
    S32 part_id;
2356
2.62M
    S32 check_for_duplicate = 0;
2357
2358
2.62M
    subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
2359
2360
2.62M
    S32 mvx_qpel;
2361
2.62M
    S32 mvy_qpel;
2362
2363
2.62M
    pf_err_compute = hme_get_calc_sad_and_result_subpel_fxn(
2364
2.62M
        ps_func_selector,
2365
2.62M
        ps_me_optimised_function_list,
2366
2.62M
        i4_part_mask,
2367
2.62M
        ps_prms->i4_use_satd,
2368
2.62M
        ps_subpel_refine_ctxt->i4_num_valid_parts,
2369
2.62M
        ps_search_results->u1_num_results_per_part);
2370
2371
2.62M
    i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2372
2.62M
    i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2373
2374
    /* Prediction context should now deal with qpel units */
2375
2.62M
    HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2376
2377
    /* Buffer allocation for subpel */
2378
    /* Current design is that there may be many partitions and different mvs */
2379
    /* that attempt subpel refinement. While there is possibility of overlap, the */
2380
    /* hashing to detect and avoid overlap may be very complex. So, currently,   */
2381
    /* the only thing done is to store the eventual predicted buffer with every  */
2382
    /* ctb node that holds the result of the best subpel search */
2383
2384
    /* Compute the base pointer for input, interpolated buffers */
2385
    /* The base pointers point as follows: */
2386
    /* fx, fy: 0, 0 :: fx, hy: 0, 0.5 :: hx, fy: 0.5, 0 :: hx, hy: 0.5, 0.5 */
2387
    /* To these, we need to add the offset of the current node */
2388
2.62M
    i4_ref_stride = ps_curr_layer->i4_rec_stride;
2389
2.62M
    i4_offset = x_off + (y_off * i4_ref_stride);
2390
2.62M
    i1_ref_idx = ps_search_node->i1_ref_idx;
2391
2392
2.62M
    apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
2393
2.62M
    apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
2394
2.62M
    apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
2395
2.62M
    apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
2396
2397
    /* Initialize result params used for partition update */
2398
2.62M
    s_result_prms.pf_mv_cost_compute = NULL;
2399
2.62M
    s_result_prms.ps_search_results = ps_search_results;
2400
2.62M
    s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
2401
2.62M
    s_result_prms.i1_ref_idx = ps_search_node->i1_ref_idx;
2402
2.62M
    s_result_prms.u1_pred_lx = search_idx;
2403
2.62M
    s_result_prms.i4_part_mask = i4_part_mask;
2404
2.62M
    s_result_prms.ps_search_node_base = ps_search_node;
2405
2.62M
    s_result_prms.pi4_sad_grid = &ai4_sad_grid[0];
2406
2.62M
    s_result_prms.i4_grid_mask = 1;
2407
2.62M
    s_result_prms.ps_search_node = &s_temp_search_node;
2408
2.62M
    s_temp_search_node.i1_ref_idx = ps_search_node->i1_ref_idx;
2409
2410
    /* convert to hpel units */
2411
2.62M
    i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
2412
2.62M
    i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
2413
2414
    /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
2415
2.62M
    ps_range_prms = ps_prms->aps_mv_range_qpel[i1_ref_idx];
2416
2.62M
    i4_grid_mask = (GRID_DIAMOND_ENABLE_ALL);
2417
2.62M
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2418
2419
2.62M
    i4_min_cost = MAX_32BIT_VAL;
2420
2.62M
    i4_min_sad = MAX_32BIT_VAL;
2421
2422
    /*************************************************************************/
2423
    /* Prepare the input params to SAD/SATD function. Note that input is     */
2424
    /* passed from the calling function since it may be I (normal subpel     */
2425
    /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
2426
    /* Both cases are handled here.                                          */
2427
    /*************************************************************************/
2428
2.62M
    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
2429
2.62M
    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
2430
2.62M
    s_err_prms.i4_ref_stride = i4_ref_stride;
2431
2.62M
    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
2432
2.62M
    s_err_prms.i4_grid_mask = 1;
2433
2.62M
    s_err_prms.pi4_sad_grid = &ai4_sad_grid[0];
2434
2.62M
    s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
2435
2.62M
    s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
2436
2437
2.62M
    s_result_prms.ps_subpel_refine_ctxt = ps_subpel_refine_ctxt;
2438
2439
2.62M
    part_id = ps_search_node->u1_part_id;
2440
3.22M
    for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
2441
3.06M
    {
2442
3.06M
        e_min_id = PT_C;
2443
2444
3.06M
        mvx_qpel = i4_mv_x << 1;
2445
3.06M
        mvy_qpel = i4_mv_y << 1;
2446
2447
        /* Central pt */
2448
3.06M
        if(i4_grid_mask & BIT_EN(PT_C))
2449
2.62M
        {
2450
            //ps_search_node->i2_mv_x = (S16)i4_mv_x;
2451
            //ps_search_node->i2_mv_x = (S16)i4_mv_y;
2452
            /* central pt is i4_mv_x, i4_mv_y */
2453
2.62M
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2454
2.62M
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
2455
2456
2.62M
            i4_frac_x = i4_mv_x & 1;
2457
2.62M
            i4_frac_y = i4_mv_y & 1;
2458
2.62M
            pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2459
2.62M
            s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2460
2461
            /* Update the mv's with the current candt motion vectors */
2462
2.62M
            s_result_prms.i2_mv_x = mvx_qpel;
2463
2.62M
            s_result_prms.i2_mv_y = mvy_qpel;
2464
2.62M
            s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2465
2.62M
            s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2466
2467
2.62M
            pf_err_compute(&s_err_prms, &s_result_prms);
2468
2469
2.62M
            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2470
2.62M
            if(i4_tot_cost < i4_min_cost)
2471
2.62M
            {
2472
2.62M
                i4_min_cost = i4_tot_cost;
2473
2.62M
                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2474
2.62M
                e_min_id = PT_C;
2475
2.62M
                pu1_final_out = s_err_prms.pu1_ref;
2476
2.62M
            }
2477
2.62M
        }
2478
2479
        /* left pt */
2480
3.06M
        if(i4_grid_mask & BIT_EN(PT_L))
2481
2.92M
        {
2482
2.92M
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2483
2.92M
                ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
2484
2485
2.92M
            if(!check_for_duplicate)
2486
2.88M
            {
2487
                /* search node mv is stored in qpel units */
2488
2.88M
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
2489
2.88M
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2490
                /* left pt is i4_mv_x - 1, i4_mv_y */
2491
2.88M
                i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
2492
2.88M
                i4_frac_y = i4_mv_y & 1;
2493
2.88M
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2494
2.88M
                s_err_prms.pu1_ref =
2495
2.88M
                    pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2496
2497
                /* Update the mv's with the current candt motion vectors */
2498
2.88M
                s_result_prms.i2_mv_x = mvx_qpel - 2;
2499
2.88M
                s_result_prms.i2_mv_y = mvy_qpel;
2500
2.88M
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 2;
2501
2.88M
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2502
2503
2.88M
                pf_err_compute(&s_err_prms, &s_result_prms);
2504
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2505
2.88M
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2506
2.88M
                if(i4_tot_cost < i4_min_cost)
2507
212k
                {
2508
212k
                    i4_min_cost = i4_tot_cost;
2509
212k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2510
212k
                    e_min_id = PT_L;
2511
212k
                    pu1_final_out = s_err_prms.pu1_ref;
2512
212k
                }
2513
2.88M
            }
2514
2.92M
        }
2515
        /* top pt */
2516
3.06M
        if(i4_grid_mask & BIT_EN(PT_T))
2517
2.94M
        {
2518
2.94M
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2519
2.94M
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
2520
2521
2.94M
            if(!check_for_duplicate)
2522
2.90M
            {
2523
                /* search node mv is stored in qpel units */
2524
2.90M
                ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
2525
2.90M
                ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
2526
                /* top pt is i4_mv_x, i4_mv_y - 1 */
2527
2.90M
                i4_frac_x = i4_mv_x & 1;
2528
2.90M
                i4_frac_y = (i4_mv_y - 1) & 1;
2529
2.90M
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2530
2.90M
                s_err_prms.pu1_ref =
2531
2.90M
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
2532
2533
                /* Update the mv's with the current candt motion vectors */
2534
2.90M
                s_result_prms.i2_mv_x = mvx_qpel;
2535
2.90M
                s_result_prms.i2_mv_y = mvy_qpel - 2;
2536
2.90M
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2537
2.90M
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 2;
2538
2539
2.90M
                pf_err_compute(&s_err_prms, &s_result_prms);
2540
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2541
2.90M
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2542
2.90M
                if(i4_tot_cost < i4_min_cost)
2543
178k
                {
2544
178k
                    i4_min_cost = i4_tot_cost;
2545
178k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2546
178k
                    e_min_id = PT_T;
2547
178k
                    pu1_final_out = s_err_prms.pu1_ref;
2548
178k
                }
2549
2.90M
            }
2550
2.94M
        }
2551
        /* right pt */
2552
3.06M
        if(i4_grid_mask & BIT_EN(PT_R))
2553
2.96M
        {
2554
2.96M
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2555
2.96M
                ps_dedup_enabler, num_unique_nodes, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
2556
2.96M
            if(!check_for_duplicate)
2557
2.91M
            {
2558
                /* search node mv is stored in qpel units */
2559
2.91M
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
2560
2.91M
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2561
                /* right pt is i4_mv_x + 1, i4_mv_y */
2562
2.91M
                i4_frac_x = (i4_mv_x + 1) & 1;
2563
2.91M
                i4_frac_y = i4_mv_y & 1;
2564
2565
2.91M
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2566
2.91M
                s_err_prms.pu1_ref =
2567
2.91M
                    pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2568
2569
                /* Update the mv's with the current candt motion vectors */
2570
2.91M
                s_result_prms.i2_mv_x = mvx_qpel + 2;
2571
2.91M
                s_result_prms.i2_mv_y = mvy_qpel;
2572
2.91M
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 2;
2573
2.91M
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2574
2575
2.91M
                pf_err_compute(&s_err_prms, &s_result_prms);
2576
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2577
2.91M
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2578
2.91M
                if(i4_tot_cost < i4_min_cost)
2579
222k
                {
2580
222k
                    i4_min_cost = i4_tot_cost;
2581
222k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2582
222k
                    e_min_id = PT_R;
2583
222k
                    pu1_final_out = s_err_prms.pu1_ref;
2584
222k
                }
2585
2.91M
            }
2586
2.96M
        }
2587
        /* bottom pt */
2588
3.06M
        if(i4_grid_mask & BIT_EN(PT_B))
2589
2.96M
        {
2590
2.96M
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2591
2.96M
                ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
2592
2.96M
            if(!check_for_duplicate)
2593
2.91M
            {
2594
                /* search node mv is stored in qpel units */
2595
2.91M
                ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
2596
2.91M
                ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
2597
2.91M
                i4_frac_x = i4_mv_x & 1;
2598
2.91M
                i4_frac_y = (i4_mv_y + 1) & 1;
2599
2.91M
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2600
2.91M
                s_err_prms.pu1_ref =
2601
2.91M
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
2602
2603
                /* Update the mv's with the current candt motion vectors */
2604
2.91M
                s_result_prms.i2_mv_x = mvx_qpel;
2605
2.91M
                s_result_prms.i2_mv_y = mvy_qpel + 2;
2606
2.91M
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2607
2.91M
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 2;
2608
2609
2.91M
                pf_err_compute(&s_err_prms, &s_result_prms);
2610
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2611
2.91M
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2612
2.91M
                if(i4_tot_cost < i4_min_cost)
2613
156k
                {
2614
156k
                    i4_min_cost = i4_tot_cost;
2615
156k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2616
156k
                    e_min_id = PT_B;
2617
156k
                    pu1_final_out = s_err_prms.pu1_ref;
2618
156k
                }
2619
2.91M
            }
2620
2.96M
        }
2621
        /* Early exit in case of central point */
2622
3.06M
        if(e_min_id == PT_C)
2623
2.46M
            break;
2624
2625
        /*********************************************************************/
2626
        /* Depending on the best result location, we may be able to skip     */
2627
        /* at least two pts, centre pt and one more pt. E.g. if right pt is  */
2628
        /* the best result, the next iteration need not do centre, left pts  */
2629
        /*********************************************************************/
2630
599k
        i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2631
599k
        i4_mv_x += gai1_grid_id_to_x[e_min_id];
2632
599k
        i4_mv_y += gai1_grid_id_to_y[e_min_id];
2633
599k
        ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2634
599k
        ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2635
599k
        i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2636
599k
    }
2637
2638
    /* Convert to QPEL units */
2639
2.62M
    i4_mv_x <<= 1;
2640
2.62M
    i4_mv_y <<= 1;
2641
2642
2.62M
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2643
2.62M
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2644
2645
    /* Exact interpolation or averaging chosen here */
2646
2.62M
    pf_qpel_interp = ps_prms->pf_qpel_interp;
2647
2648
    /* Next QPEL ME */
2649
    /* In this case, we have option of doing exact QPEL interpolation or avg */
2650
    /*************************************************************************/
2651
    /*        x                                                              */
2652
    /*    A b C d                                                            */
2653
    /*    e f g h                                                            */
2654
    /*    I j K l                                                            */
2655
    /*    m n o p                                                            */
2656
    /*    Q r S t                                                            */
2657
    /*                                                                       */
2658
    /*    Approximate QPEL logic                                             */
2659
    /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
2660
    /*    for any given pt, we can get all the information required about    */
2661
    /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
2662
    /*     surrounding pts info:                                             */
2663
    /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
2664
    /*           buffer 2: hxfy, offsets for both are 0, 0                   */
2665
    /*    similarly for other pts the info can be gotten                     */
2666
    /*************************************************************************/
2667
2.62M
    i4_grid_mask = GRID_DIAMOND_ENABLE_ALL ^ (BIT_EN(PT_C));
2668
2.62M
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2669
2670
    /*************************************************************************/
2671
    /* One time preparation of non changing interpolation params. These      */
2672
    /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
2673
    /* working memory (not used though in case of averaging).                */
2674
    /*************************************************************************/
2675
2.62M
    s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
2676
2.62M
    s_interp_prms.i4_ref_stride = i4_ref_stride;
2677
2.62M
    s_interp_prms.i4_blk_wd = i4_blk_wd;
2678
2.62M
    s_interp_prms.i4_blk_ht = i4_blk_ht;
2679
2680
2.62M
    i4_final_out_stride = i4_ref_stride;
2681
2682
2.62M
    {
2683
2.62M
        U08 *pu1_mem;
2684
        /*********************************************************************/
2685
        /* Allocation of working memory for interpolated buffers. We maintain*/
2686
        /* an intermediate working buffer, and 2 ping pong interpolated out  */
2687
        /* buffers, purpose of ping pong explained later below               */
2688
        /*********************************************************************/
2689
2.62M
        pu1_mem = ps_prms->pu1_wkg_mem;
2690
2.62M
        s_interp_prms.pu1_wkg_mem = pu1_mem;
2691
2692
        //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
2693
2.62M
        s_interp_prms.apu1_interp_out[0] = pu1_mem;
2694
2695
2.62M
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2696
2.62M
        s_interp_prms.apu1_interp_out[1] = pu1_mem;
2697
2698
2.62M
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2699
2.62M
        s_interp_prms.apu1_interp_out[2] = pu1_mem;
2700
2701
2.62M
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2702
2.62M
        s_interp_prms.apu1_interp_out[3] = pu1_mem;
2703
2704
2.62M
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2705
2.62M
        s_interp_prms.apu1_interp_out[4] = pu1_mem;
2706
2707
        /*********************************************************************/
2708
        /* Stride of interpolated output is just a function of blk width of  */
2709
        /* this partition and hence remains constant for this partition      */
2710
        /*********************************************************************/
2711
2.62M
        s_interp_prms.i4_out_stride = (i4_blk_wd);
2712
2.62M
    }
2713
2714
2.62M
    {
2715
2.62M
        UWORD8 *apu1_final[4];
2716
2.62M
        WORD32 ai4_ref_stride[4];
2717
        /*************************************************************************/
2718
        /* Ping pong design for interpolated buffers. We use a min id, which     */
2719
        /* tracks the id of the ppu1_interp_out that stores the best result.     */
2720
        /* When new interp to be done, it uses 1 - best result id to do the interp*/
2721
        /* min id is toggled when any new result becomes the best result.        */
2722
        /*************************************************************************/
2723
2724
3.00M
        for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
2725
2.81M
        {
2726
2.81M
            e_min_id = PT_C;
2727
2728
2.81M
            mvx_qpel = i4_mv_x;
2729
2.81M
            mvy_qpel = i4_mv_y;
2730
2.81M
            hme_qpel_interp_comprehensive(
2731
2.81M
                &s_interp_prms,
2732
2.81M
                apu1_final,
2733
2.81M
                ai4_ref_stride,
2734
2.81M
                i4_mv_x,
2735
2.81M
                i4_mv_y,
2736
2.81M
                i4_grid_mask,
2737
2.81M
                ps_me_optimised_function_list);
2738
2.81M
            if(i4_grid_mask & BIT_EN(PT_L))
2739
2.73M
            {
2740
2.73M
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2741
2.73M
                    ps_dedup_enabler,
2742
2.73M
                    num_unique_nodes,
2743
2.73M
                    mvx_qpel - 1,
2744
2.73M
                    mvy_qpel - 0,
2745
2.73M
                    check_for_duplicate);
2746
2747
2.73M
                if(!check_for_duplicate)
2748
2.68M
                {
2749
2.68M
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
2750
2.68M
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2751
2752
2.68M
                    s_err_prms.pu1_ref = apu1_final[0];
2753
2.68M
                    s_err_prms.i4_ref_stride = ai4_ref_stride[0];
2754
2755
                    /* Update the mv's with the current candt motion vectors */
2756
2.68M
                    s_result_prms.i2_mv_x = mvx_qpel - 1;
2757
2.68M
                    s_result_prms.i2_mv_y = mvy_qpel;
2758
2.68M
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 1;
2759
2.68M
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2760
2761
2.68M
                    pf_err_compute(&s_err_prms, &s_result_prms);
2762
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2763
2764
2.68M
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2765
2.68M
                    if(i4_tot_cost < i4_min_cost)
2766
139k
                    {
2767
139k
                        e_min_id = PT_L;
2768
139k
                        i4_min_cost = i4_tot_cost;
2769
139k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2770
139k
                    }
2771
2.68M
                }
2772
2.73M
            }
2773
2.81M
            if(i4_grid_mask & BIT_EN(PT_T))
2774
2.73M
            {
2775
2.73M
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2776
2.73M
                    ps_dedup_enabler,
2777
2.73M
                    num_unique_nodes,
2778
2.73M
                    mvx_qpel - 0,
2779
2.73M
                    mvy_qpel - 1,
2780
2.73M
                    check_for_duplicate);
2781
2782
2.73M
                if(!check_for_duplicate)
2783
2.68M
                {
2784
2.68M
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2785
2.68M
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
2786
2787
2.68M
                    s_err_prms.pu1_ref = apu1_final[1];
2788
2.68M
                    s_err_prms.i4_ref_stride = ai4_ref_stride[1];
2789
2790
                    /* Update the mv's with the current candt motion vectors */
2791
2.68M
                    s_result_prms.i2_mv_x = mvx_qpel;
2792
2.68M
                    s_result_prms.i2_mv_y = mvy_qpel - 1;
2793
2794
2.68M
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2795
2.68M
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 1;
2796
2797
2.68M
                    pf_err_compute(&s_err_prms, &s_result_prms);
2798
2799
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2800
2.68M
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2801
2.68M
                    if(i4_tot_cost < i4_min_cost)
2802
115k
                    {
2803
115k
                        e_min_id = PT_T;
2804
115k
                        i4_min_cost = i4_tot_cost;
2805
115k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2806
115k
                    }
2807
2.68M
                }
2808
2.73M
            }
2809
2.81M
            if(i4_grid_mask & BIT_EN(PT_R))
2810
2.74M
            {
2811
2.74M
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2812
2.74M
                    ps_dedup_enabler, num_unique_nodes, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
2813
2814
2.74M
                if(!check_for_duplicate)
2815
2.69M
                {
2816
2.69M
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
2817
2.69M
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2818
2819
2.69M
                    s_err_prms.pu1_ref = apu1_final[2];
2820
2.69M
                    s_err_prms.i4_ref_stride = ai4_ref_stride[2];
2821
2822
                    /* Update the mv's with the current candt motion vectors */
2823
2.69M
                    s_result_prms.i2_mv_x = mvx_qpel + 1;
2824
2.69M
                    s_result_prms.i2_mv_y = mvy_qpel;
2825
2826
2.69M
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 1;
2827
2.69M
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2828
2829
2.69M
                    pf_err_compute(&s_err_prms, &s_result_prms);
2830
2831
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2832
2833
2.69M
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2834
2.69M
                    if(i4_tot_cost < i4_min_cost)
2835
123k
                    {
2836
123k
                        e_min_id = PT_R;
2837
123k
                        i4_min_cost = i4_tot_cost;
2838
123k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2839
123k
                    }
2840
2.69M
                }
2841
2.74M
            }
2842
            /* i4_mv_x and i4_mv_y will always be the centre pt */
2843
            /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
2844
2.81M
            if(i4_grid_mask & BIT_EN(PT_B))
2845
2.74M
            {
2846
2.74M
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2847
2.74M
                    ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
2848
2849
2.74M
                if(!check_for_duplicate)
2850
2.68M
                {
2851
2.68M
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2852
2.68M
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
2853
2854
2.68M
                    s_err_prms.pu1_ref = apu1_final[3];
2855
2.68M
                    s_err_prms.i4_ref_stride = ai4_ref_stride[3];
2856
2857
                    /* Update the mv's with the current candt motion vectors */
2858
2.68M
                    s_result_prms.i2_mv_x = mvx_qpel;
2859
2.68M
                    s_result_prms.i2_mv_y = mvy_qpel + 1;
2860
2861
2.68M
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2862
2.68M
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 1;
2863
2864
2.68M
                    pf_err_compute(&s_err_prms, &s_result_prms);
2865
2866
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2867
2.68M
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2868
2.68M
                    if(i4_tot_cost < i4_min_cost)
2869
97.6k
                    {
2870
97.6k
                        e_min_id = PT_B;
2871
97.6k
                        i4_min_cost = i4_tot_cost;
2872
97.6k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2873
97.6k
                    }
2874
2.68M
                }
2875
2.74M
            }
2876
2877
            /* New QPEL mv x and y */
2878
2.81M
            if(e_min_id == PT_C)
2879
2.43M
                break;
2880
383k
            i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2881
383k
            i4_mv_x += gai1_grid_id_to_x[e_min_id];
2882
383k
            i4_mv_y += gai1_grid_id_to_y[e_min_id];
2883
383k
            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2884
383k
            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2885
383k
            i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2886
383k
        }
2887
2.62M
    }
2888
2889
    /* update modified motion vectors and cost at end of subpel */
2890
2.62M
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2891
2.62M
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2892
2.62M
    ps_search_node->i4_tot_cost = i4_min_cost;
2893
2.62M
    ps_search_node->i4_sad = i4_min_sad;
2894
2895
    /********************************************************************************/
2896
    /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
2897
    /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
2898
    /********************************************************************************/
2899
    //ps_pred_ctxt->lambda >>= 1;
2900
2901
2.62M
    return (i4_min_cost);
2902
2.62M
}
2903
#elif DIAMOND_GRID == 0
2904
S32 hme_subpel_refine_search_node_high_speed(
2905
    search_node_t *ps_search_node,
2906
    hme_subpel_prms_t *ps_prms,
2907
    layer_ctxt_t *ps_curr_layer,
2908
    BLK_SIZE_T e_blk_size,
2909
    S32 x_off,
2910
    S32 y_off,
2911
    search_results_t *ps_search_results,
2912
    S32 pred_lx,
2913
    S32 i4_part_mask,
2914
    S32 *pi4_valid_part_ids,
2915
    S32 search_idx,
2916
    subpel_dedup_enabler_t *ps_dedup_enabler,
2917
    me_func_selector_t *ps_func_selector)
2918
{
2919
    S32 i4_num_hpel_refine, i4_num_qpel_refine;
2920
    S32 i4_offset, i4_grid_mask;
2921
    S08 i1_ref_idx;
2922
    S32 i4_blk_wd, i4_blk_ht;
2923
    S32 i4_ref_stride, i4_i;
2924
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2925
    result_upd_prms_t s_result_prms;
2926
2927
    /*************************************************************************/
2928
    /* Tracks current MV with the fractional component.                      */
2929
    /*************************************************************************/
2930
    S32 i4_mv_x, i4_mv_y;
2931
    S32 i4_frac_x, i4_frac_y;
2932
2933
    /*************************************************************************/
2934
    /* Function pointer for SAD/SATD, array and prms structure to pass to    */
2935
    /* This function                                                         */
2936
    /*************************************************************************/
2937
    PF_SAD_FXN_T pf_err_compute;
2938
    S32 ai4_sad_grid[9][17], i4_tot_cost;
2939
    err_prms_t s_err_prms;
2940
2941
    /*************************************************************************/
2942
    /* Allowed MV RANGE                                                      */
2943
    /*************************************************************************/
2944
    range_prms_t *ps_range_prms;
2945
2946
    /*************************************************************************/
2947
    /* stores min id in grid with associated min cost.                       */
2948
    /*************************************************************************/
2949
    S32 i4_min_cost, i4_min_sad;
2950
    GRID_PT_T e_min_id;
2951
2952
    PF_INTERP_FXN_T pf_qpel_interp;
2953
    /*************************************************************************/
2954
    /* For hpel and qpel we move in diamonds and hence each point in the     */
2955
    /* diamond will belong to a completely different plane. To simplify the  */
2956
    /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
2957
    /* hpel planes which are interpolated during recon.                      */
2958
    /*************************************************************************/
2959
    U08 *apu1_hpel_ref[4], *pu1_ref;
2960
2961
    interp_prms_t s_interp_prms;
2962
2963
    /*************************************************************************/
2964
    /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2965
    /* points to the corresponding predicted buf with its stride.            */
2966
    /* Note that the pointer cannot be derived just from the id, since the   */
2967
    /* pointer may also point to the hpel buffer (in case we request interp  */
2968
    /* of a hpel pt, which already exists in the recon hpel planes)          */
2969
    /*************************************************************************/
2970
    U08 *pu1_final_out;
2971
    S32 i4_final_out_stride;
2972
    S32 part_id;
2973
    S32 check_for_duplicate = 0;
2974
2975
    S32 mvx_qpel;
2976
    S32 mvy_qpel;
2977
2978
    /*************************************************************************/
2979
    /* Appropriate Err compute fxn, depends on SAD/SATD, blk size and remains*/
2980
    /* fixed through this subpel refinement for this partition.              */
2981
    /* Note, we do not enable grid sads since each pt is different buffers.  */
2982
    /* Hence, part mask is also nearly dont care and we use 2Nx2N enabled.   */
2983
    /*************************************************************************/
2984
    if(ps_prms->i4_use_satd)
2985
    {
2986
        pf_err_compute = hme_evalsatd_update_1_best_result_pt_pu_16x16;
2987
    }
2988
    else
2989
    {
2990
        pf_err_compute = hme_evalsad_grid_pu_16x16; /* hme_evalsad_pt_pu_16x16; */
2991
    }
2992
2993
    i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2994
    i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2995
2996
    /* Prediction context should now deal with qpel units */
2997
    HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2998
2999
    /* Buffer allocation for subpel */
3000
    /* Current design is that there may be many partitions and different mvs */
3001
    /* that attempt subpel refinement. While there is possibility of overlap, the */
3002
    /* hashing to detect and avoid overlap may be very complex. So, currently,   */
3003
    /* the only thing done is to store the eventual predicted buffer with every  */
3004
    /* ctb node that holds the result of the best subpel search */
3005
3006
    /* Compute the base pointer for input, interpolated buffers */
3007
    /* The base pointers point as follows: */
3008
    /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, hy: 0.5, 0.5 */
3009
    /* To these, we need to add the offset of the current node */
3010
    i4_ref_stride = ps_curr_layer->i4_rec_stride;
3011
    i4_offset = x_off + (y_off * i4_ref_stride);
3012
    i1_ref_idx = ps_search_node->i1_ref_idx;
3013
3014
    apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
3015
    apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
3016
    apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
3017
    apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
3018
3019
    /* Initialize result params used for partition update */
3020
    s_result_prms.pf_mv_cost_compute = NULL;
3021
    s_result_prms.ps_search_results = ps_search_results;
3022
    s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
3023
    s_result_prms.i1_ref_idx = search_idx;
3024
    s_result_prms.i4_part_mask = i4_part_mask;
3025
    s_result_prms.ps_search_node_base = ps_search_node;
3026
    s_result_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3027
    s_result_prms.i4_grid_mask = 1;
3028
3029
    /* convert to hpel units */
3030
    i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
3031
    i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
3032
3033
    /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
3034
    ps_range_prms = ps_prms->ps_mv_range_qpel;
3035
    i4_grid_mask = (GRID_ALL_PTS_VALID);
3036
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3037
3038
    i4_min_cost = MAX_32BIT_VAL;
3039
    i4_min_sad = MAX_32BIT_VAL;
3040
3041
    /*************************************************************************/
3042
    /* Prepare the input params to SAD/SATD function. Note that input is     */
3043
    /* passed from the calling function since it may be I (normal subpel     */
3044
    /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
3045
    /* Both cases are handled here.                                          */
3046
    /*************************************************************************/
3047
    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
3048
    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
3049
    s_err_prms.i4_ref_stride = i4_ref_stride;
3050
    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
3051
    s_err_prms.i4_grid_mask = 1;
3052
    s_err_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3053
    s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
3054
    s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
3055
3056
    /* TODO: Currently doubling lambda for Hadamard Sad instead of 1.9*sadlambda */
3057
    //ps_pred_ctxt->lambda <<= 1;
3058
    part_id = ps_search_node->u1_part_id;
3059
    for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
3060
    {
3061
        e_min_id = PT_C;
3062
3063
        mvx_qpel = i4_mv_x << 1;
3064
        mvy_qpel = i4_mv_y << 1;
3065
3066
        /* Central pt */
3067
        if(i4_grid_mask & BIT_EN(PT_C))
3068
        {
3069
            //ps_search_node->i2_mv_x = (S16)i4_mv_x;
3070
            //ps_search_node->i2_mv_x = (S16)i4_mv_y;
3071
            /* central pt is i4_mv_x, i4_mv_y */
3072
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3073
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
3074
3075
            i4_frac_x = i4_mv_x & 1;
3076
            i4_frac_y = i4_mv_y & 1;
3077
            pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3078
            s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3079
            pf_err_compute(&s_err_prms);
3080
            /* Update the mv's with the current candt motion vectors */
3081
            s_result_prms.i2_mv_x = mvx_qpel;
3082
            s_result_prms.i2_mv_y = mvy_qpel;
3083
            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3084
            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3085
            if(i4_tot_cost < i4_min_cost)
3086
            {
3087
                i4_min_cost = i4_tot_cost;
3088
                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3089
                e_min_id = PT_C;
3090
                pu1_final_out = s_err_prms.pu1_ref;
3091
            }
3092
        }
3093
3094
        /* left pt */
3095
        if(i4_grid_mask & BIT_EN(PT_L))
3096
        {
3097
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3098
                ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
3099
3100
            if(!check_for_duplicate)
3101
            {
3102
                /* search node mv is stored in qpel units */
3103
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
3104
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3105
                /* left pt is i4_mv_x - 1, i4_mv_y */
3106
                i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
3107
                i4_frac_y = i4_mv_y & 1;
3108
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3109
                s_err_prms.pu1_ref =
3110
                    pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3111
3112
                pf_err_compute(&s_err_prms);
3113
                /* Update the mv's with the current candt motion vectors */
3114
                s_result_prms.i2_mv_x = mvx_qpel;
3115
                s_result_prms.i2_mv_y = mvy_qpel;
3116
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3117
3118
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3119
3120
                if(i4_tot_cost < i4_min_cost)
3121
                {
3122
                    i4_min_cost = i4_tot_cost;
3123
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3124
                    e_min_id = PT_L;
3125
                    pu1_final_out = s_err_prms.pu1_ref;
3126
                }
3127
            }
3128
        }
3129
        /* top pt */
3130
        if(i4_grid_mask & BIT_EN(PT_T))
3131
        {
3132
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3133
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
3134
3135
            if(!check_for_duplicate)
3136
            {
3137
                /* search node mv is stored in qpel units */
3138
                ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3139
                ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
3140
                /* top pt is i4_mv_x, i4_mv_y - 1 */
3141
                i4_frac_x = i4_mv_x & 1;
3142
                i4_frac_y = (i4_mv_y - 1) & 1;
3143
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3144
                s_err_prms.pu1_ref =
3145
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
3146
                pf_err_compute(&s_err_prms);
3147
                /* Update the mv's with the current candt motion vectors */
3148
                s_result_prms.i2_mv_x = mvx_qpel;
3149
                s_result_prms.i2_mv_y = mvy_qpel - 2;
3150
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3151
3152
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3153
3154
                if(i4_tot_cost < i4_min_cost)
3155
                {
3156
                    i4_min_cost = i4_tot_cost;
3157
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3158
                    e_min_id = PT_T;
3159
                    pu1_final_out = s_err_prms.pu1_ref;
3160
                }
3161
            }
3162
        }
3163
        /* right pt */
3164
        if(i4_grid_mask & BIT_EN(PT_R))
3165
        {
3166
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3167
                ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
3168
3169
            if(!check_for_duplicate)
3170
            {
3171
                /* search node mv is stored in qpel units */
3172
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
3173
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3174
                /* right pt is i4_mv_x + 1, i4_mv_y */
3175
                i4_frac_x = (i4_mv_x + 1) & 1;
3176
                i4_frac_y = i4_mv_y & 1;
3177
3178
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3179
                s_err_prms.pu1_ref =
3180
                    pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3181
                pf_err_compute(&s_err_prms);
3182
                /* Update the mv's with the current candt motion vectors */
3183
                s_result_prms.i2_mv_x = mvx_qpel + 2;
3184
                s_result_prms.i2_mv_y = mvy_qpel;
3185
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3186
3187
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3188
3189
                if(i4_tot_cost < i4_min_cost)
3190
                {
3191
                    i4_min_cost = i4_tot_cost;
3192
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3193
                    e_min_id = PT_R;
3194
                    pu1_final_out = s_err_prms.pu1_ref;
3195
                }
3196
            }
3197
        }
3198
        /* bottom pt */
3199
        if(i4_grid_mask & BIT_EN(PT_B))
3200
        {
3201
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3202
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
3203
3204
            if(!check_for_duplicate)
3205
            {
3206
                /* search node mv is stored in qpel units */
3207
                ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
3208
                ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
3209
                i4_frac_x = i4_mv_x & 1;
3210
                i4_frac_y = (i4_mv_y + 1) & 1;
3211
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3212
                s_err_prms.pu1_ref =
3213
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
3214
3215
                pf_err_compute(&s_err_prms);
3216
                /* Update the mv's with the current candt motion vectors */
3217
                s_result_prms.i2_mv_x = mvx_qpel;
3218
                s_result_prms.i2_mv_y = mvy_qpel + 2;
3219
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3220
3221
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3222
3223
                if(i4_tot_cost < i4_min_cost)
3224
                {
3225
                    i4_min_cost = i4_tot_cost;
3226
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3227
                    e_min_id = PT_B;
3228
                    pu1_final_out = s_err_prms.pu1_ref;
3229
                }
3230
            }
3231
        }
3232
        if(e_min_id == PT_C)
3233
        {
3234
            if(!i4_i)
3235
            {
3236
                /* TL pt */
3237
                if(i4_grid_mask & BIT_EN(PT_TL))
3238
                {
3239
                    S32 mvx_minus_1 = (i4_mv_x - 1);
3240
                    S32 mvy_minus_1 = (i4_mv_y - 1);
3241
3242
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3243
                        ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel - 2, check_for_duplicate);
3244
3245
                    if(!check_for_duplicate)
3246
                    {
3247
                        /* search node mv is stored in qpel units */
3248
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3249
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3250
                        i4_frac_x = mvx_minus_1 & 1;
3251
                        i4_frac_y = mvy_minus_1 & 1;
3252
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3253
                        s_err_prms.pu1_ref =
3254
                            pu1_ref + (mvx_minus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3255
3256
                        pf_err_compute(&s_err_prms);
3257
                        /* Update the mv's with the current candt motion vectors */
3258
                        s_result_prms.i2_mv_x = mvx_qpel - 2;
3259
                        s_result_prms.i2_mv_y = mvy_qpel - 2;
3260
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3261
3262
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3263
3264
                        if(i4_tot_cost < i4_min_cost)
3265
                        {
3266
                            i4_min_cost = i4_tot_cost;
3267
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3268
                            e_min_id = PT_TL;
3269
                            pu1_final_out = s_err_prms.pu1_ref;
3270
                        }
3271
                    }
3272
                }
3273
                /* TR pt */
3274
                if(i4_grid_mask & BIT_EN(PT_TR))
3275
                {
3276
                    S32 mvx_plus_1 = (i4_mv_x + 1);
3277
                    S32 mvy_minus_1 = (i4_mv_y - 1);
3278
3279
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3280
                        ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel - 2, check_for_duplicate);
3281
3282
                    if(!check_for_duplicate)
3283
                    {
3284
                        /* search node mv is stored in qpel units */
3285
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3286
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3287
                        i4_frac_x = mvx_plus_1 & 1;
3288
                        i4_frac_y = mvy_minus_1 & 1;
3289
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3290
                        s_err_prms.pu1_ref =
3291
                            pu1_ref + (mvx_plus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3292
3293
                        pf_err_compute(&s_err_prms);
3294
                        /* Update the mv's with the current candt motion vectors */
3295
                        s_result_prms.i2_mv_x = mvx_qpel + 2;
3296
                        s_result_prms.i2_mv_y = mvy_qpel - 2;
3297
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3298
3299
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3300
3301
                        if(i4_tot_cost < i4_min_cost)
3302
                        {
3303
                            i4_min_cost = i4_tot_cost;
3304
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3305
                            e_min_id = PT_TR;
3306
                            pu1_final_out = s_err_prms.pu1_ref;
3307
                        }
3308
                    }
3309
                }
3310
                /* BL pt */
3311
                if(i4_grid_mask & BIT_EN(PT_BL))
3312
                {
3313
                    S32 mvx_minus_1 = (i4_mv_x - 1);
3314
                    S32 mvy_plus_1 = (i4_mv_y + 1);
3315
3316
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3317
                        ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel + 2, check_for_duplicate);
3318
3319
                    if(!check_for_duplicate)
3320
                    {
3321
                        /* search node mv is stored in qpel units */
3322
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3323
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3324
                        i4_frac_x = mvx_minus_1 & 1;
3325
                        i4_frac_y = mvy_plus_1 & 1;
3326
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3327
                        s_err_prms.pu1_ref =
3328
                            pu1_ref + (mvx_minus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3329
3330
                        pf_err_compute(&s_err_prms);
3331
                        /* Update the mv's with the current candt motion vectors */
3332
                        s_result_prms.i2_mv_x = mvx_qpel - 2;
3333
                        s_result_prms.i2_mv_y = mvy_qpel + 2;
3334
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3335
3336
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3337
3338
                        if(i4_tot_cost < i4_min_cost)
3339
                        {
3340
                            i4_min_cost = i4_tot_cost;
3341
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3342
                            e_min_id = PT_BL;
3343
                            pu1_final_out = s_err_prms.pu1_ref;
3344
                        }
3345
                    }
3346
                }
3347
                /* BR pt */
3348
                if(i4_grid_mask & BIT_EN(PT_BR))
3349
                {
3350
                    S32 mvx_plus_1 = (i4_mv_x + 1);
3351
                    S32 mvy_plus_1 = (i4_mv_y + 1);
3352
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3353
                        ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel + 2, check_for_duplicate);
3354
3355
                    if(!check_for_duplicate)
3356
                    {
3357
                        /* search node mv is stored in qpel units */
3358
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3359
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3360
                        i4_frac_x = mvx_plus_1 & 1;
3361
                        i4_frac_y = mvy_plus_1 & 1;
3362
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3363
                        s_err_prms.pu1_ref =
3364
                            pu1_ref + (mvx_plus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3365
3366
                        pf_err_compute(&s_err_prms);
3367
                        /* Update the mv's with the current candt motion vectors */
3368
                        s_result_prms.i2_mv_x = mvx_qpel + 2;
3369
                        s_result_prms.i2_mv_y = mvy_qpel + 2;
3370
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3371
3372
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3373
3374
                        if(i4_tot_cost < i4_min_cost)
3375
                        {
3376
                            i4_min_cost = i4_tot_cost;
3377
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3378
                            e_min_id = PT_BR;
3379
                            pu1_final_out = s_err_prms.pu1_ref;
3380
                        }
3381
                    }
3382
                }
3383
                if(e_min_id == PT_C)
3384
                {
3385
                    break;
3386
                }
3387
            }
3388
            else
3389
            {
3390
                break;
3391
            }
3392
        }
3393
3394
        /*********************************************************************/
3395
        /* Depending on the best result location, we may be able to skip     */
3396
        /* atleast two pts, centre pt and one more pt. E.g. if right pt is   */
3397
        /* the best result, the next iteration need not do centre, left pts  */
3398
        /*********************************************************************/
3399
        if(i4_i)
3400
        {
3401
            i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3402
        }
3403
        else
3404
        {
3405
            i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3406
        }
3407
        i4_mv_x += gai1_grid_id_to_x[e_min_id];
3408
        i4_mv_y += gai1_grid_id_to_y[e_min_id];
3409
        ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3410
        ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3411
        i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3412
    }
3413
3414
    /* Convert to QPEL units */
3415
    i4_mv_x <<= 1;
3416
    i4_mv_y <<= 1;
3417
3418
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3419
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3420
3421
    /* Early exit if this partition is visiting same hpel mv again */
3422
    /* Assumption : Checking for early exit in best result of partition */
3423
    if((ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x ==
3424
        ps_search_node->s_mv.i2_mvx) &&
3425
       (ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y ==
3426
        ps_search_node->s_mv.i2_mvy))
3427
    {
3428
        return (ps_search_results->aps_part_results[search_idx][part_id][0].i4_tot_cost);
3429
    }
3430
    else
3431
    {
3432
        /* Store the best hpel mv for future early exit checks */
3433
        ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x =
3434
            (S16)i4_mv_x;
3435
        ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y =
3436
            (S16)i4_mv_y;
3437
    }
3438
3439
    /* Early exit if this partition is visiting same hpel mv again */
3440
    /* Assumption : Checkin for early exit in second best result of partition */
3441
    if((ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x ==
3442
        ps_search_node->s_mv.i2_mvx) &&
3443
       (ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y ==
3444
        ps_search_node->s_mv.i2_mvy))
3445
    {
3446
        return (ps_search_results->aps_part_results[search_idx][part_id][1].i4_tot_cost);
3447
    }
3448
    else
3449
    {
3450
        /* Store the best hpel mv for future early exit checks */
3451
        ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x =
3452
            (S16)i4_mv_x;
3453
        ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y =
3454
            (S16)i4_mv_y;
3455
    }
3456
3457
    /* Exact interpolation or averaging chosen here */
3458
    pf_qpel_interp = ps_prms->pf_qpel_interp;
3459
3460
    /* Next QPEL ME */
3461
    /* In this case, we have option of doing exact QPEL interpolation or avg */
3462
    /*************************************************************************/
3463
    /*        x                                                              */
3464
    /*    A b C d                                                            */
3465
    /*    e f g h                                                            */
3466
    /*    I j K l                                                            */
3467
    /*    m n o p                                                            */
3468
    /*    Q r S t                                                            */
3469
    /*                                                                       */
3470
    /*    Approximate QPEL logic                                             */
3471
    /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
3472
    /*    for any given pt, we can get all the information required about    */
3473
    /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
3474
    /*     surrounding pts info:                                             */
3475
    /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
3476
    /*           buffer 2: hxfy, offsets for both are 0, 0                   */
3477
    /*    similarly for other pts the info can be gotten                     */
3478
    /*************************************************************************/
3479
    i4_grid_mask = GRID_ALL_PTS_VALID ^ (BIT_EN(PT_C));
3480
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3481
3482
    /*************************************************************************/
3483
    /* One time preparation of non changing interpolation params. These      */
3484
    /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
3485
    /* working memory (not used though in case of averaging).                */
3486
    /*************************************************************************/
3487
    s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
3488
    s_interp_prms.i4_ref_stride = i4_ref_stride;
3489
    s_interp_prms.i4_blk_wd = i4_blk_wd;
3490
    s_interp_prms.i4_blk_ht = i4_blk_ht;
3491
3492
    i4_final_out_stride = i4_ref_stride;
3493
3494
    {
3495
        U08 *pu1_mem;
3496
        /*********************************************************************/
3497
        /* Allocation of working memory for interpolated buffers. We maintain*/
3498
        /* an intermediate working buffer, and 2 ping pong interpolated out  */
3499
        /* buffers, purpose of ping pong explained later below               */
3500
        /*********************************************************************/
3501
        pu1_mem = ps_prms->pu1_wkg_mem;
3502
        s_interp_prms.pu1_wkg_mem = pu1_mem;
3503
3504
        //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
3505
        s_interp_prms.apu1_interp_out[0] = pu1_mem;
3506
3507
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3508
        s_interp_prms.apu1_interp_out[1] = pu1_mem;
3509
3510
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3511
        s_interp_prms.apu1_interp_out[2] = pu1_mem;
3512
3513
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3514
        s_interp_prms.apu1_interp_out[3] = pu1_mem;
3515
3516
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3517
        s_interp_prms.apu1_interp_out[4] = pu1_mem;
3518
3519
        /*********************************************************************/
3520
        /* Stride of interpolated output is just a function of blk width of  */
3521
        /* this partition and hence remains constant for this partition      */
3522
        /*********************************************************************/
3523
        s_interp_prms.i4_out_stride = (i4_blk_wd);
3524
    }
3525
3526
    {
3527
        UWORD8 *apu1_final[4];
3528
        WORD32 ai4_ref_stride[4];
3529
        /*************************************************************************/
3530
        /* Ping pong design for interpolated buffers. We use a min id, which     */
3531
        /* tracks the id of the ppu1_interp_out that stores the best result.     */
3532
        /* When new interp to be done, it uses 1 - bes result id to do the interp*/
3533
        /* min id is toggled when any new result becomes the best result.        */
3534
        /*************************************************************************/
3535
3536
        for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
3537
        {
3538
            e_min_id = PT_C;
3539
3540
            hme_qpel_interp_comprehensive(
3541
                &s_interp_prms, apu1_final, ai4_ref_stride, i4_mv_x, i4_mv_y, i4_grid_mask);
3542
3543
            mvx_qpel = i4_mv_x;
3544
            mvy_qpel = i4_mv_y;
3545
3546
            if(i4_grid_mask & BIT_EN(PT_L))
3547
            {
3548
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3549
                    ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 0, check_for_duplicate);
3550
3551
                if(!check_for_duplicate)
3552
                {
3553
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3554
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3555
3556
                    s_err_prms.pu1_ref = apu1_final[0];
3557
                    s_err_prms.i4_ref_stride = ai4_ref_stride[0];
3558
3559
                    pf_err_compute(&s_err_prms);
3560
                    /* Update the mv's with the current candt motion vectors */
3561
                    s_result_prms.i2_mv_x = mvx_qpel - 1;
3562
                    s_result_prms.i2_mv_y = mvy_qpel;
3563
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3564
3565
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3566
                    if(i4_tot_cost < i4_min_cost)
3567
                    {
3568
                        e_min_id = PT_L;
3569
                        i4_min_cost = i4_tot_cost;
3570
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3571
                    }
3572
                }
3573
            }
3574
            if(i4_grid_mask & BIT_EN(PT_T))
3575
            {
3576
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3577
                    ps_dedup_enabler, 1, mvx_qpel - 0, mvy_qpel - 1, check_for_duplicate);
3578
3579
                if(!check_for_duplicate)
3580
                {
3581
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3582
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3583
3584
                    s_err_prms.pu1_ref = apu1_final[1];
3585
                    s_err_prms.i4_ref_stride = ai4_ref_stride[1];
3586
3587
                    pf_err_compute(&s_err_prms);
3588
                    /* Update the mv's with the current candt motion vectors */
3589
                    s_result_prms.i2_mv_x = mvx_qpel;
3590
                    s_result_prms.i2_mv_y = mvy_qpel - 1;
3591
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3592
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3593
                    if(i4_tot_cost < i4_min_cost)
3594
                    {
3595
                        e_min_id = PT_T;
3596
                        i4_min_cost = i4_tot_cost;
3597
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3598
                    }
3599
                }
3600
            }
3601
            if(i4_grid_mask & BIT_EN(PT_R))
3602
            {
3603
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3604
                    ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
3605
3606
                if(!check_for_duplicate)
3607
                {
3608
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3609
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3610
3611
                    s_err_prms.pu1_ref = apu1_final[2];
3612
                    s_err_prms.i4_ref_stride = ai4_ref_stride[2];
3613
3614
                    pf_err_compute(&s_err_prms);
3615
                    /* Update the mv's with the current candt motion vectors */
3616
                    s_result_prms.i2_mv_x = mvx_qpel + 1;
3617
                    s_result_prms.i2_mv_y = mvy_qpel;
3618
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3619
3620
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3621
                    if(i4_tot_cost < i4_min_cost)
3622
                    {
3623
                        e_min_id = PT_R;
3624
                        i4_min_cost = i4_tot_cost;
3625
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3626
                    }
3627
                }
3628
            }
3629
            /* i4_mv_x and i4_mv_y will always be the centre pt */
3630
            /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
3631
            if(i4_grid_mask & BIT_EN(PT_B))
3632
            {
3633
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3634
                    ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
3635
3636
                if(!check_for_duplicate)
3637
                {
3638
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3639
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3640
3641
                    s_err_prms.pu1_ref = apu1_final[3];
3642
                    s_err_prms.i4_ref_stride = ai4_ref_stride[3];
3643
3644
                    pf_err_compute(&s_err_prms);
3645
                    /* Update the mv's with the current candt motion vectors */
3646
                    s_result_prms.i2_mv_x = mvx_qpel;
3647
                    s_result_prms.i2_mv_y = mvy_qpel + 1;
3648
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3649
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3650
                    if(i4_tot_cost < i4_min_cost)
3651
                    {
3652
                        e_min_id = PT_B;
3653
                        i4_min_cost = i4_tot_cost;
3654
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3655
                    }
3656
                }
3657
            }
3658
3659
            if(e_min_id == PT_C)
3660
            {
3661
                if(!i4_i)
3662
                {
3663
                    S32 i4_interp_buf_id = 0;
3664
3665
                    if(i4_grid_mask & BIT_EN(PT_TL))
3666
                    {
3667
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3668
                            ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 1, check_for_duplicate);
3669
3670
                        if(!check_for_duplicate)
3671
                        {
3672
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3673
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3674
3675
                            /* Carry out the interpolation */
3676
                            pf_qpel_interp(
3677
                                &s_interp_prms, i4_mv_x - 1, i4_mv_y - 1, i4_interp_buf_id);
3678
3679
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3680
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3681
3682
                            pf_err_compute(&s_err_prms);
3683
                            /* Update the mv's with the current candt motion vectors */
3684
                            s_result_prms.i2_mv_x = mvx_qpel - 1;
3685
                            s_result_prms.i2_mv_y = mvy_qpel - 1;
3686
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3687
3688
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3689
3690
                            if(i4_tot_cost < i4_min_cost)
3691
                            {
3692
                                e_min_id = PT_TL;
3693
                                i4_min_cost = i4_tot_cost;
3694
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3695
                            }
3696
                        }
3697
                    }
3698
                    if(i4_grid_mask & BIT_EN(PT_TR))
3699
                    {
3700
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3701
                            ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel - 1, check_for_duplicate);
3702
3703
                        if(!check_for_duplicate)
3704
                        {
3705
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3706
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3707
3708
                            /* Carry out the interpolation */
3709
                            pf_qpel_interp(
3710
                                &s_interp_prms, i4_mv_x + 1, i4_mv_y - 1, i4_interp_buf_id);
3711
3712
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3713
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3714
3715
                            pf_err_compute(&s_err_prms);
3716
                            /* Update the mv's with the current candt motion vectors */
3717
                            s_result_prms.i2_mv_x = mvx_qpel + 1;
3718
                            s_result_prms.i2_mv_y = mvy_qpel - 1;
3719
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3720
3721
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3722
3723
                            if(i4_tot_cost < i4_min_cost)
3724
                            {
3725
                                e_min_id = PT_TR;
3726
                                i4_min_cost = i4_tot_cost;
3727
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3728
                            }
3729
                        }
3730
                    }
3731
                    if(i4_grid_mask & BIT_EN(PT_BL))
3732
                    {
3733
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3734
                            ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel + 1, check_for_duplicate);
3735
3736
                        if(!check_for_duplicate)
3737
                        {
3738
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3739
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3740
3741
                            /* Carry out the interpolation */
3742
                            pf_qpel_interp(
3743
                                &s_interp_prms, i4_mv_x - 1, i4_mv_y + 1, i4_interp_buf_id);
3744
3745
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3746
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3747
3748
                            pf_err_compute(&s_err_prms);
3749
                            /* Update the mv's with the current candt motion vectors */
3750
                            s_result_prms.i2_mv_x = mvx_qpel - 1;
3751
                            s_result_prms.i2_mv_y = mvy_qpel + 1;
3752
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3753
3754
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3755
3756
                            if(i4_tot_cost < i4_min_cost)
3757
                            {
3758
                                e_min_id = PT_BL;
3759
                                i4_min_cost = i4_tot_cost;
3760
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3761
                            }
3762
                        }
3763
                    }
3764
                    /* i4_mv_x and i4_mv_y will always be the centre pt */
3765
                    /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
3766
                    if(i4_grid_mask & BIT_EN(PT_BR))
3767
                    {
3768
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3769
                            ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel + 1, check_for_duplicate);
3770
3771
                        if(!check_for_duplicate)
3772
                        {
3773
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3774
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3775
3776
                            /* Carry out the interpolation */
3777
                            pf_qpel_interp(
3778
                                &s_interp_prms, i4_mv_x + 1, i4_mv_y + 1, i4_interp_buf_id);
3779
3780
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3781
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3782
3783
                            pf_err_compute(&s_err_prms);
3784
                            /* Update the mv's with the current candt motion vectors */
3785
                            s_result_prms.i2_mv_x = mvx_qpel + 1;
3786
                            s_result_prms.i2_mv_y = mvy_qpel + 1;
3787
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3788
3789
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3790
3791
                            if(i4_tot_cost < i4_min_cost)
3792
                            {
3793
                                e_min_id = PT_BR;
3794
                                i4_min_cost = i4_tot_cost;
3795
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3796
                            }
3797
                        }
3798
                    }
3799
                    if(e_min_id == PT_C)
3800
                    {
3801
                        break;
3802
                    }
3803
                }
3804
                else
3805
                {
3806
                    break;
3807
                }
3808
            }
3809
3810
            if(i4_i)
3811
            {
3812
                i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3813
            }
3814
            else
3815
            {
3816
                i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3817
            }
3818
            i4_mv_x += gai1_grid_id_to_x[e_min_id];
3819
            i4_mv_y += gai1_grid_id_to_y[e_min_id];
3820
            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3821
            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3822
            i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3823
        }
3824
    }
3825
3826
    /* update modified motion vectors and cost at end of subpel */
3827
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3828
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3829
    ps_search_node->i4_tot_cost = i4_min_cost;
3830
    ps_search_node->i4_sad = i4_min_sad;
3831
3832
    /********************************************************************************/
3833
    /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
3834
    /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
3835
    /********************************************************************************/
3836
    //ps_pred_ctxt->lambda >>= 1;
3837
3838
    return (i4_min_cost);
3839
}
3840
#endif
3841
3842
static void hme_subpel_refine_struct_to_search_results_struct_converter(
3843
    subpel_refine_ctxt_t *ps_subpel_refine_ctxt,
3844
    search_results_t *ps_search_results,
3845
    U08 u1_pred_dir,
3846
    ME_QUALITY_PRESETS_T e_quality_preset)
3847
2.36M
{
3848
2.36M
    U08 i;
3849
3850
2.36M
    U08 u1_num_results_per_part = ps_search_results->u1_num_results_per_part;
3851
3852
23.3M
    for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
3853
20.9M
    {
3854
20.9M
        S32 index;
3855
20.9M
        S32 i4_sad;
3856
3857
20.9M
        S32 part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
3858
3859
20.9M
        search_node_t *ps_best_node = ps_search_results->aps_part_results[u1_pred_dir][part_id];
3860
3861
20.9M
        if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
3862
19.5M
        {
3863
19.5M
            index = part_id;
3864
19.5M
        }
3865
1.47M
        else
3866
1.47M
        {
3867
1.47M
            index = i;
3868
1.47M
        }
3869
3870
20.9M
        if(!ps_best_node->u1_subpel_done)
3871
11.4M
        {
3872
11.4M
            i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3873
11.4M
                     ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3874
11.4M
            ps_best_node[0].i4_sdi = 0;
3875
11.4M
            ASSERT((e_quality_preset == ME_PRISTINE_QUALITY) ? (ps_best_node[0].i4_sdi >= 0) : 1);
3876
11.4M
            ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3877
3878
11.4M
            if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3879
2.11k
            {
3880
2.11k
                i4_sad = MAX_SIGNED_16BIT_VAL;
3881
2.11k
            }
3882
3883
11.4M
            ps_best_node[0].i4_sad = i4_sad;
3884
11.4M
            ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3885
11.4M
            ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3886
11.4M
            ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3887
11.4M
            ps_best_node[0].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3888
11.4M
            ps_best_node->u1_subpel_done = 1;
3889
3890
11.4M
            if(2 == u1_num_results_per_part)
3891
0
            {
3892
0
                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3893
0
                         ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3894
0
                ps_best_node[1].i4_sdi = 0;
3895
0
                ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3896
3897
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3898
0
                {
3899
0
                    i4_sad = MAX_SIGNED_16BIT_VAL;
3900
0
                }
3901
3902
0
                ps_best_node[1].i4_sad = i4_sad;
3903
0
                ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3904
0
                ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3905
0
                ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3906
0
                ps_best_node[1].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3907
0
                ps_best_node[1].u1_subpel_done = 1;
3908
0
            }
3909
11.4M
        }
3910
9.53M
        else if(
3911
9.53M
            (2 == u1_num_results_per_part) &&
3912
0
            (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[1].i4_tot_cost))
3913
0
        {
3914
0
            if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] < ps_best_node[0].i4_tot_cost)
3915
0
            {
3916
0
                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3917
0
                         ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3918
0
                ps_best_node[0].i4_sdi = 0;
3919
0
                ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3920
3921
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3922
0
                {
3923
0
                    i4_sad = MAX_SIGNED_16BIT_VAL;
3924
0
                }
3925
3926
0
                ps_best_node[0].i4_sad = i4_sad;
3927
0
                ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3928
0
                ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3929
0
                ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3930
0
                ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3931
3932
0
                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3933
0
                         ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3934
0
                ps_best_node[1].i4_sdi = 0;
3935
0
                ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3936
3937
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3938
0
                {
3939
0
                    i4_sad = MAX_SIGNED_16BIT_VAL;
3940
0
                }
3941
3942
0
                ps_best_node[1].i4_sad = i4_sad;
3943
0
                ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3944
0
                ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3945
0
                ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3946
0
                ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3947
0
            }
3948
0
            else if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] > ps_best_node[0].i4_tot_cost)
3949
0
            {
3950
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >= ps_best_node[0].i4_tot_cost)
3951
0
                {
3952
0
                    i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3953
0
                             ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3954
0
                    ps_best_node[1].i4_sdi = 0;
3955
0
                    ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3956
3957
0
                    if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3958
0
                    {
3959
0
                        i4_sad = MAX_SIGNED_16BIT_VAL;
3960
0
                    }
3961
3962
0
                    ps_best_node[1].i4_sad = i4_sad;
3963
0
                    ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3964
0
                    ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3965
0
                    ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3966
0
                    ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3967
0
                }
3968
0
                else if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost)
3969
0
                {
3970
0
                    memmove(&ps_best_node[1], &ps_best_node[0], sizeof(search_node_t));
3971
3972
0
                    i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3973
0
                             ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3974
0
                    ps_best_node[0].i4_sdi = 0;
3975
0
                    ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3976
3977
0
                    if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3978
0
                    {
3979
0
                        i4_sad = MAX_SIGNED_16BIT_VAL;
3980
0
                    }
3981
3982
0
                    ps_best_node[0].i4_sad = i4_sad;
3983
0
                    ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3984
0
                    ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3985
0
                    ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3986
0
                    ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3987
0
                }
3988
0
            }
3989
0
        }
3990
9.53M
        else if(
3991
9.53M
            (1 == u1_num_results_per_part) &&
3992
9.53M
            (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost))
3993
416k
        {
3994
416k
            i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3995
416k
                     ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3996
416k
            ps_best_node[0].i4_sdi = 0;
3997
416k
            ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3998
3999
416k
            if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
4000
0
            {
4001
0
                i4_sad = MAX_SIGNED_16BIT_VAL;
4002
0
            }
4003
4004
416k
            ps_best_node[0].i4_sad = i4_sad;
4005
416k
            ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4006
416k
            ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4007
416k
            ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4008
416k
            ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
4009
416k
        }
4010
20.9M
    }
4011
2.36M
}
4012
4013
/**
4014
********************************************************************************
4015
*  @fn     S32 hme_subpel_refine_cu_hs
4016
*
4017
*  @brief  Evaluates the best subpel mvs for active partitions of an MB in L0
4018
*          layer for the high speed preset. Recursive hadamard SATD / SAD
4019
*          and mv cost is used for 2NxN and NxN partitions with active partition
4020
*          update
4021
*
4022
*  @param[in]  ps_prms: subpel prms input to this function
4023
*
4024
*  @param[in]  ps_curr_layer: points to the current layer ctxt
4025
*
4026
*  @param[out] ps_search_results: points to the search results that get updated
4027
*              with best results
4028
*
4029
*  @param[in]  search_idx:  ref id of the frame for which results get updated
4030
*
4031
*  @param[in]  ps_wt_inp_prms:  current frame input params
4032
*
4033
*  @return     None
4034
********************************************************************************
4035
*/
4036
void hme_subpel_refine_cu_hs(
4037
    hme_subpel_prms_t *ps_prms,
4038
    layer_ctxt_t *ps_curr_layer,
4039
    search_results_t *ps_search_results,
4040
    S32 search_idx,
4041
    wgt_pred_ctxt_t *ps_wt_inp_prms,
4042
    WORD32 blk_8x8_mask,
4043
    me_func_selector_t *ps_func_selector,
4044
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
4045
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
4046
2.36M
{
4047
    /* Unique search node list for 2nx2n and nxn partitions */
4048
2.36M
    search_node_t as_nodes_2nx2n[MAX_RESULTS_PER_PART * 5];
4049
2.36M
    subpel_dedup_enabler_t as_subpel_dedup_enabler[MAX_NUM_REF];
4050
2.36M
    search_node_t *ps_search_node;
4051
4052
2.36M
    S32 i, i4_part_mask, j;
4053
2.36M
    S32 i4_sad_grid;
4054
2.36M
    S32 max_subpel_cand;
4055
2.36M
    WORD32 index;
4056
2.36M
    S32 num_unique_nodes_2nx2n;
4057
2.36M
    S32 part_id;
4058
2.36M
    S32 x_off, y_off;
4059
2.36M
    S32 i4_inp_off;
4060
4061
2.36M
    CU_SIZE_T e_cu_size;
4062
2.36M
    BLK_SIZE_T e_blk_size;
4063
4064
2.36M
    subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
4065
4066
2.36M
    S32 i4_use_satd = ps_prms->i4_use_satd;
4067
2.36M
    S32 i4_num_act_refs = ps_prms->i4_num_act_ref_l0 + ps_prms->i4_num_act_ref_l1;
4068
4069
2.36M
    ASSERT(ps_search_results->u1_num_results_per_part <= MAX_RESULTS_PER_PART);
4070
4071
2.36M
    if(!DISABLE_SUBPEL_REFINEMENT_WHEN_SRC_IS_NOISY || !ps_prms->u1_is_cu_noisy)
4072
2.36M
    {
4073
2.36M
        e_cu_size = ps_search_results->e_cu_size;
4074
2.36M
        i4_part_mask = ps_search_results->i4_part_mask;
4075
4076
2.36M
        ps_prms->i4_inp_type = sizeof(U08);
4077
4078
2.36M
        num_unique_nodes_2nx2n = 0;
4079
4080
8.22M
        for(i = 0; i < i4_num_act_refs; i++)
4081
5.86M
        {
4082
5.86M
            as_subpel_dedup_enabler[i].u1_ref_idx = MAX_NUM_REF;
4083
5.86M
        }
4084
4085
        /************************************************************************/
4086
        /*                                                                      */
4087
        /*  Initialize SATD cost for each valid partition id once, before       */
4088
        /*  subpel refinement starts. This is because of the following reasons: */
4089
        /*   1. Full pel cost was done in  SAD while subpel is in SATD mode     */
4090
        /*   2. Partitions like AMP, Nx2N and 2NxN are refined on the fly while */
4091
        /*      doing Diamond search for 2Nx2N and NxN. These partitions are    */
4092
        /*      not explicitly refined in high speed mode                       */
4093
        /*                                                                      */
4094
        /************************************************************************/
4095
23.3M
        for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4096
20.9M
        {
4097
20.9M
            S32 enable_subpel = 0;
4098
20.9M
            S32 part_type;
4099
4100
            /* Derive the x and y offsets of this part id */
4101
20.9M
            part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4102
20.9M
            if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4103
19.5M
            {
4104
19.5M
                index = part_id;
4105
19.5M
            }
4106
1.47M
            else
4107
1.47M
            {
4108
1.47M
                index = i;
4109
1.47M
            }
4110
4111
20.9M
            part_type = ge_part_id_to_part_type[part_id];
4112
20.9M
            x_off = gas_part_attr_in_cu[part_id].u1_x_start << e_cu_size;
4113
20.9M
            y_off = gas_part_attr_in_cu[part_id].u1_y_start << e_cu_size;
4114
20.9M
            x_off += ps_search_results->u1_x_off;
4115
20.9M
            y_off += ps_search_results->u1_y_off;
4116
20.9M
            i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4117
20.9M
            e_blk_size = ge_part_id_to_blk_size[e_cu_size][part_id];
4118
4119
20.9M
            x_off += ps_prms->i4_ctb_x_off;
4120
20.9M
            y_off += ps_prms->i4_ctb_y_off;
4121
4122
20.9M
            max_subpel_cand = 0;
4123
4124
            /* Choose the minimum number of candidates to be used for Sub pel refinement */
4125
20.9M
            if(PART_ID_2Nx2N == part_type)
4126
2.30M
            {
4127
2.30M
                max_subpel_cand =
4128
2.30M
                    MIN(ps_prms->u1_max_subpel_candts_2Nx2N,
4129
2.30M
                        ps_search_results->u1_num_results_per_part);
4130
2.30M
            }
4131
18.6M
            else if(PRT_NxN == part_type)
4132
4.88M
            {
4133
4.88M
                max_subpel_cand = MIN(
4134
4.88M
                    ps_prms->u1_max_subpel_candts_NxN, ps_search_results->u1_num_results_per_part);
4135
4.88M
            }
4136
4137
            /* If incomplete CTB, NxN num candidates should be forced to min 1 */
4138
20.9M
            if((0 == max_subpel_cand) && (blk_8x8_mask != 15))
4139
72.0k
            {
4140
72.0k
                max_subpel_cand = 1;
4141
72.0k
            }
4142
4143
20.9M
            if((PART_ID_2Nx2N == part_type) || (PRT_NxN == part_type))
4144
7.19M
            {
4145
7.19M
                enable_subpel = 1;
4146
7.19M
            }
4147
4148
            /* Compute full pel SATD for each result per partition before subpel */
4149
            /* refinement starts.                                                */
4150
            /* Also prepare unique candidate list for 2Nx2N and NxN partitions   */
4151
41.9M
            for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4152
20.9M
            {
4153
20.9M
                err_prms_t s_err_prms;
4154
20.9M
                S32 i4_satd = 0;
4155
20.9M
                S32 i1_ref_idx;
4156
20.9M
                U08 *pu1_ref_base;
4157
20.9M
                S32 i4_ref_stride = ps_curr_layer->i4_rec_stride;
4158
20.9M
                S32 i4_mv_x, i4_mv_y;
4159
4160
20.9M
                ps_search_node = ps_search_results->aps_part_results[search_idx][part_id] + j;
4161
4162
20.9M
                if(ps_subpel_refine_ctxt->i2_mv_x[j][index] == INTRA_MV)
4163
0
                {
4164
0
                    ps_search_node->u1_subpel_done = 1;
4165
0
                    continue;
4166
0
                }
4167
4168
20.9M
                i1_ref_idx = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4169
20.9M
                ps_prms->pv_inp = (void *)(ps_wt_inp_prms->apu1_wt_inp[i1_ref_idx] + i4_inp_off);
4170
20.9M
                pu1_ref_base = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx];
4171
4172
20.9M
                i4_mv_x = ps_subpel_refine_ctxt->i2_mv_x[j][index];
4173
20.9M
                i4_mv_y = ps_subpel_refine_ctxt->i2_mv_y[j][index];
4174
4175
20.9M
                if(i4_use_satd)
4176
19.9M
                {
4177
19.9M
                    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
4178
19.9M
                    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
4179
19.9M
                    s_err_prms.pu1_ref = pu1_ref_base + x_off + (y_off * i4_ref_stride) + i4_mv_x +
4180
19.9M
                                         (i4_mv_y * i4_ref_stride);
4181
4182
19.9M
                    s_err_prms.i4_ref_stride = i4_ref_stride;
4183
19.9M
                    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
4184
19.9M
                    s_err_prms.i4_grid_mask = 1;
4185
19.9M
                    s_err_prms.pi4_sad_grid = &i4_sad_grid;
4186
19.9M
                    s_err_prms.i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
4187
19.9M
                    s_err_prms.i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
4188
4189
19.9M
                    s_err_prms.ps_cmn_utils_optimised_function_list =
4190
19.9M
                        ps_cmn_utils_optimised_function_list;
4191
4192
19.9M
                    compute_satd_8bit(&s_err_prms);
4193
4194
19.9M
                    i4_satd = s_err_prms.pi4_sad_grid[0];
4195
4196
19.9M
                    ps_subpel_refine_ctxt->i2_tot_cost[j][index] =
4197
19.9M
                        CLIP_S16(ps_subpel_refine_ctxt->i2_mv_cost[j][index] + i4_satd);
4198
19.9M
                    ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index] = i4_satd;
4199
19.9M
                }
4200
4201
                /* Sub-pel candidate filtration */
4202
20.9M
                if(j)
4203
0
                {
4204
0
                    S16 i2_best_sad;
4205
0
                    S32 i4_best_mvx;
4206
0
                    S32 i4_best_mvy;
4207
4208
0
                    search_node_t *ps_node =
4209
0
                        ps_search_results->aps_part_results[search_idx][part_id];
4210
4211
0
                    U08 u1_is_subpel_done = ps_node->u1_subpel_done;
4212
0
                    S16 i2_curr_sad = ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index];
4213
0
                    S32 i4_curr_mvx = i4_mv_x << 2;
4214
0
                    S32 i4_curr_mvy = i4_mv_y << 2;
4215
4216
0
                    if(u1_is_subpel_done)
4217
0
                    {
4218
0
                        i2_best_sad = ps_node->i4_sad;
4219
4220
0
                        if(ps_node->i1_ref_idx == i1_ref_idx)
4221
0
                        {
4222
0
                            i4_best_mvx = ps_node->s_mv.i2_mvx;
4223
0
                            i4_best_mvy = ps_node->s_mv.i2_mvy;
4224
0
                        }
4225
0
                        else if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4226
0
                        {
4227
0
                            i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4228
0
                            i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4229
0
                        }
4230
0
                        else
4231
0
                        {
4232
0
                            i4_best_mvx = INTRA_MV;
4233
0
                            i4_best_mvy = INTRA_MV;
4234
0
                        }
4235
0
                    }
4236
0
                    else
4237
0
                    {
4238
0
                        i2_best_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
4239
0
                                      ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4240
4241
0
                        if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4242
0
                        {
4243
0
                            i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4244
0
                            i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4245
0
                        }
4246
0
                        else
4247
0
                        {
4248
0
                            i4_best_mvx = INTRA_MV;
4249
0
                            i4_best_mvy = INTRA_MV;
4250
0
                        }
4251
0
                    }
4252
4253
0
                    i2_best_sad += (i2_best_sad >> ps_prms->u1_subpel_candt_threshold);
4254
4255
0
                    if(((ABS(i4_curr_mvx - i4_best_mvx) < 2) &&
4256
0
                        (ABS(i4_curr_mvy - i4_best_mvy) < 2)) ||
4257
0
                       (i2_curr_sad > i2_best_sad))
4258
0
                    {
4259
0
                        enable_subpel = 0;
4260
0
                    }
4261
0
                }
4262
4263
20.9M
                ps_search_node->u1_part_id = part_id;
4264
4265
                /* Convert mvs in part results from FPEL to QPEL units */
4266
20.9M
                ps_subpel_refine_ctxt->i2_mv_x[j][index] <<= 2;
4267
20.9M
                ps_subpel_refine_ctxt->i2_mv_y[j][index] <<= 2;
4268
4269
                /* If the candidate number is more than the number of candts
4270
                set initially, do not add those candts for refinement */
4271
20.9M
                if(j >= max_subpel_cand)
4272
13.8M
                {
4273
13.8M
                    enable_subpel = 0;
4274
13.8M
                }
4275
4276
20.9M
                if(enable_subpel)
4277
7.14M
                {
4278
7.14M
                    if(num_unique_nodes_2nx2n == 0)
4279
2.36M
                    {
4280
2.36M
                        S32 i4_index = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4281
4282
2.36M
                        as_subpel_dedup_enabler[i4_index].i2_mv_x =
4283
2.36M
                            ps_subpel_refine_ctxt->i2_mv_x[j][index];
4284
2.36M
                        as_subpel_dedup_enabler[i4_index].i2_mv_y =
4285
2.36M
                            ps_subpel_refine_ctxt->i2_mv_y[j][index];
4286
2.36M
                        as_subpel_dedup_enabler[i4_index].u1_ref_idx =
4287
2.36M
                            (U08)ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4288
2.36M
                        memset(
4289
2.36M
                            as_subpel_dedup_enabler[i4_index].au4_node_map,
4290
2.36M
                            0,
4291
2.36M
                            sizeof(U32) * 2 * MAP_X_MAX);
4292
2.36M
                    }
4293
7.14M
                    INSERT_NEW_NODE_NOMAP_ALTERNATE(
4294
7.14M
                        as_nodes_2nx2n, num_unique_nodes_2nx2n, ps_subpel_refine_ctxt, j, i);
4295
7.14M
                }
4296
20.9M
            }
4297
4298
            /*********************************************************************************************/
4299
            /* If sad_1 < sad_2, then satd_1 need not be lesser than satd_2. Therefore, after conversion */
4300
            /* to satd, tot_cost_1 may not be lesser than tot_cost_2. So we need to sort the search nodes*/
4301
            /* for each partition again, based on the new costs                                          */
4302
            /*********************************************************************************************/
4303
            /*********************************************************************************************/
4304
            /* Because right now, we store only the two best candidates for each partition, the sort will*/
4305
            /* converge to a simple swap.                                                                */
4306
            /* ASSUMPTION : We store only two best results per partition                                 */
4307
            /*********************************************************************************************/
4308
20.9M
            if(ps_search_results->u1_num_results_per_part == 2)
4309
0
            {
4310
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >
4311
0
                   ps_subpel_refine_ctxt->i2_tot_cost[1][index])
4312
0
                {
4313
0
                    SWAP(
4314
0
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index],
4315
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
4316
4317
0
                    SWAP(
4318
0
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index],
4319
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index]);
4320
4321
0
                    SWAP(
4322
0
                        ps_subpel_refine_ctxt->i2_mv_x[0][index],
4323
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index]);
4324
4325
0
                    SWAP(
4326
0
                        ps_subpel_refine_ctxt->i2_mv_y[0][index],
4327
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index]);
4328
4329
0
                    SWAP(
4330
0
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index],
4331
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index]);
4332
4333
0
                    SWAP(
4334
0
                        ps_subpel_refine_ctxt->ai2_fullpel_satd[0][index],
4335
0
                        ps_subpel_refine_ctxt->ai2_fullpel_satd[1][index]);
4336
0
                }
4337
0
            }
4338
20.9M
        }
4339
4340
2.36M
        if(blk_8x8_mask == 0xf)
4341
2.30M
        {
4342
2.30M
            num_unique_nodes_2nx2n =
4343
2.30M
                MIN(num_unique_nodes_2nx2n, ps_prms->u1_max_num_subpel_refine_centers);
4344
2.30M
        }
4345
2.36M
        {
4346
2.36M
            x_off = gas_part_attr_in_cu[0].u1_x_start << e_cu_size;
4347
2.36M
            y_off = gas_part_attr_in_cu[0].u1_y_start << e_cu_size;
4348
2.36M
            x_off += ps_search_results->u1_x_off;
4349
2.36M
            y_off += ps_search_results->u1_y_off;
4350
2.36M
            i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4351
2.36M
            e_blk_size = ge_part_id_to_blk_size[e_cu_size][0];
4352
4353
4.98M
            for(j = 0; j < num_unique_nodes_2nx2n; j++)
4354
2.62M
            {
4355
2.62M
                S32 pred_lx;
4356
2.62M
                ps_search_node = &as_nodes_2nx2n[j];
4357
4358
2.62M
                if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
4359
0
                {
4360
0
                    continue;
4361
0
                }
4362
4363
2.62M
                {
4364
2.62M
                    S08 i1_ref_idx = ps_search_node->i1_ref_idx;
4365
2.62M
                    subpel_dedup_enabler_t *ps_dedup_enabler =
4366
2.62M
                        &(as_subpel_dedup_enabler[i1_ref_idx]);
4367
4368
2.62M
                    if(ps_dedup_enabler->u1_ref_idx == MAX_NUM_REF)
4369
8.37k
                    {
4370
8.37k
                        as_subpel_dedup_enabler[i1_ref_idx].i2_mv_x = ps_search_node->s_mv.i2_mvx;
4371
8.37k
                        as_subpel_dedup_enabler[i1_ref_idx].i2_mv_y = ps_search_node->s_mv.i2_mvy;
4372
8.37k
                        as_subpel_dedup_enabler[i1_ref_idx].u1_ref_idx = i1_ref_idx;
4373
8.37k
                        memset(
4374
8.37k
                            as_subpel_dedup_enabler[i1_ref_idx].au4_node_map,
4375
8.37k
                            0,
4376
8.37k
                            sizeof(U32) * 2 * MAP_X_MAX);
4377
8.37k
                    }
4378
2.62M
                }
4379
4380
2.62M
                pred_lx = search_idx;
4381
2.62M
                ps_prms->pv_inp =
4382
2.62M
                    (void *)(ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off);
4383
4384
2.62M
                hme_subpel_refine_search_node_high_speed(
4385
2.62M
                    ps_search_node,
4386
2.62M
                    ps_prms,
4387
2.62M
                    ps_curr_layer,
4388
2.62M
                    e_blk_size,
4389
2.62M
                    x_off + ps_prms->i4_ctb_x_off,
4390
2.62M
                    y_off + ps_prms->i4_ctb_y_off,
4391
2.62M
                    ps_search_results,
4392
2.62M
                    pred_lx,
4393
2.62M
                    i4_part_mask,
4394
2.62M
                    &ps_subpel_refine_ctxt->ai4_part_id[0],
4395
2.62M
                    search_idx,
4396
2.62M
                    &(as_subpel_dedup_enabler[ps_search_node->i1_ref_idx]),
4397
2.62M
                    ps_func_selector,
4398
2.62M
                    ps_me_optimised_function_list);
4399
2.62M
            }
4400
2.36M
        }
4401
2.36M
    }
4402
0
    else
4403
0
    {
4404
0
        for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4405
0
        {
4406
0
            S32 i4_index;
4407
4408
0
            S32 i4_part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4409
4410
0
            if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4411
0
            {
4412
0
                i4_index = i4_part_id;
4413
0
            }
4414
0
            else
4415
0
            {
4416
0
                i4_index = i;
4417
0
            }
4418
4419
0
            for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4420
0
            {
4421
0
                ps_subpel_refine_ctxt->i2_mv_x[j][i4_index] <<= 2;
4422
0
                ps_subpel_refine_ctxt->i2_mv_y[j][i4_index] <<= 2;
4423
0
            }
4424
0
        }
4425
0
    }
4426
4427
2.36M
    hme_subpel_refine_struct_to_search_results_struct_converter(
4428
2.36M
        ps_subpel_refine_ctxt, ps_search_results, search_idx, ps_prms->e_me_quality_presets);
4429
2.36M
}