Coverage Report

Created: 2026-06-10 06:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/encoder/hme_err_compute.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/*!
22
***************************************************************************
23
* \file hme_err_compute.c
24
*
25
* \brief
26
*    SAD / SATD routines for error computation
27
*
28
* Detailed_description : Contains various types of SAD/SATD routines for
29
*   error computation between a given input and reference ptr. The SAD
30
*   routines can evaluate for either a single point or a grid, and can
31
*   evaluate with either partial updates or no partial updates. Partial
32
*   updates means evaluating sub block SADs, e.g. 4 4x4 subblock SAD in
33
*   addition to the main 8x8 block SAD.
34
*
35
* \date
36
*    22/9/2012
37
*
38
* \author  Ittiam
39
***************************************************************************
40
*/
41
42
/*****************************************************************************/
43
/* File Includes                                                             */
44
/*****************************************************************************/
45
/* System include files */
46
#include <stdio.h>
47
#include <string.h>
48
#include <stdlib.h>
49
#include <assert.h>
50
#include <stdarg.h>
51
#include <math.h>
52
#include <limits.h>
53
54
/* User include files */
55
#include "ihevc_typedefs.h"
56
#include "itt_video_api.h"
57
#include "ihevce_api.h"
58
59
#include "rc_cntrl_param.h"
60
#include "rc_frame_info_collector.h"
61
#include "rc_look_ahead_params.h"
62
63
#include "ihevc_defs.h"
64
#include "ihevc_structs.h"
65
#include "ihevc_platform_macros.h"
66
#include "ihevc_deblk.h"
67
#include "ihevc_itrans_recon.h"
68
#include "ihevc_chroma_itrans_recon.h"
69
#include "ihevc_chroma_intra_pred.h"
70
#include "ihevc_intra_pred.h"
71
#include "ihevc_inter_pred.h"
72
#include "ihevc_mem_fns.h"
73
#include "ihevc_padding.h"
74
#include "ihevc_weighted_pred.h"
75
#include "ihevc_sao.h"
76
#include "ihevc_resi_trans.h"
77
#include "ihevc_quant_iquant_ssd.h"
78
#include "ihevc_cabac_tables.h"
79
80
#include "ihevce_defs.h"
81
#include "ihevce_lap_enc_structs.h"
82
#include "ihevce_multi_thrd_structs.h"
83
#include "ihevce_multi_thrd_funcs.h"
84
#include "ihevce_me_common_defs.h"
85
#include "ihevce_had_satd.h"
86
#include "ihevce_error_codes.h"
87
#include "ihevce_bitstream.h"
88
#include "ihevce_cabac.h"
89
#include "ihevce_rdoq_macros.h"
90
#include "ihevce_function_selector.h"
91
#include "ihevce_enc_structs.h"
92
#include "ihevce_entropy_structs.h"
93
#include "ihevce_cmn_utils_instr_set_router.h"
94
#include "ihevce_enc_loop_structs.h"
95
#include "ihevce_bs_compute_ctb.h"
96
#include "ihevce_global_tables.h"
97
#include "ihevce_dep_mngr_interface.h"
98
#include "hme_datatype.h"
99
#include "hme_interface.h"
100
#include "hme_common_defs.h"
101
#include "hme_defs.h"
102
#include "ihevce_me_instr_set_router.h"
103
#include "hme_globals.h"
104
#include "hme_utils.h"
105
#include "hme_coarse.h"
106
#include "hme_refine.h"
107
#include "hme_err_compute.h"
108
#include "hme_common_utils.h"
109
#include "hme_search_algo.h"
110
#include "ihevce_stasino_helpers.h"
111
112
/******************************************************************************
113
*                         MACRO DEFINITIONS
114
******************************************************************************/
115
116
/*****************************************************************************/
117
/* Theoretically, the various types of SAD functions that are needed for     */
118
/* reasons of optimality. SADs that are to be evaluated at a single pt can be*/
119
/* more optimal than SADs that are to be evaluated for a grid of 3x3. The    */
120
/* SADs to be evaluated at a grid are classified as separate functions, since*/
121
/* evaluating them on a single function call helps reuse inputs for a small  */
122
/* grid of 3x3. Also, if no partial updates are required, there are 3 basic  */
123
/* functions, width 4K (K = odd number), width 8K (K = odd number) and width */
124
/* 16K, K any number. For partial updates, it is assumed that the block size */
125
/* is square (8x8, 16x16, 32x32, 64x64) and further differentiation is done  */
126
/* based on the basic evaluation unit. E.g. if 16x16 blk size requires, part */
127
/* update on AMP partitions, then basic SAD unit is 4x4, if it doesnt, then  */
128
/* basic SAD unit is 8x8.                                                    */
129
/*****************************************************************************/
130
131
/* Every UPD_RES_* alias below expands to the same routine, hme_update_results_grid_pu_bestn. */
#define UPD_RES_PT_NPU_BEST1 hme_update_results_grid_pu_bestn
132
#define UPD_RES_PT_NPU_BESTN hme_update_results_grid_pu_bestn
133
#define UPD_RES_PT_PU_BEST1 hme_update_results_grid_pu_bestn
134
#define UPD_RES_PT_PU_BESTN hme_update_results_grid_pu_bestn
135
#define UPD_RES_GRID_NPU_BEST1 hme_update_results_grid_pu_bestn
136
#define UPD_RES_GRID_NPU_BESTN hme_update_results_grid_pu_bestn
137
#define UPD_RES_GRID_PU_BEST1 hme_update_results_grid_pu_bestn
138
#define UPD_RES_GRID_PU_BESTN hme_update_results_grid_pu_bestn
139
140
/*******************************************************************************
141
*                         FUNCTION DEFINITIONS
142
*******************************************************************************/
143
/**
********************************************************************************
*  @fn     hme_cmp_nodes
*
*  @brief  Checks whether two search nodes carry the same motion vector
*          (x and y components) and the same reference index.
*
*  @param[in]  ps_best_node1 : first node to compare
*
*  @param[in]  ps_best_node2 : second node to compare
*
*  @return   0 when mvx, mvy and ref idx all match, -1 otherwise
********************************************************************************
*/
S32 hme_cmp_nodes(search_node_t *ps_best_node1, search_node_t *ps_best_node2)
{
    /* Bail out on the first field that differs */
    if(ps_best_node1->s_mv.i2_mvx != ps_best_node2->s_mv.i2_mvx)
    {
        return -1;
    }
    if(ps_best_node1->s_mv.i2_mvy != ps_best_node2->s_mv.i2_mvy)
    {
        return -1;
    }
    if(ps_best_node1->i1_ref_idx != ps_best_node2->i1_ref_idx)
    {
        return -1;
    }
    return 0;
}
153
154
void compute_4x4_sads_for_16x16_blk(
155
    grid_ctxt_t *ps_grid, /* Grid ctxt */
156
    UWORD8 *pu1_cur_ptr, /* Pointer to top-left of current block */
157
    WORD32 cur_buf_stride, /* Buffer stride of current buffer */
158
    UWORD16 **
159
        u2_part_sads, /* 2D Array containing SADs for all 17 partitions. As many rows as partitions. SADs in a row correspond to each of the candidates */
160
    cand_t *ps_cand, /* Return the list of candidates evaluated */
161
    WORD32 *num_cands /* Number of candidates that were processed */
162
)
163
0
{
164
0
    WORD32 a, b, c, d, i;
165
0
    WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
166
0
    WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
167
    //WORD32 offset_x[9] = {-grd_sz_x, 0, grd_sz_x, -grd_sz_x, 0, grd_sz_x, grd_sz_x, 0, -grd_sz_x};
168
    //WORD32 offset_y[9] = {-grd_sz_y, -grd_sz_y, -grd_sz_y, 0, 0, 0, grd_sz_y, grd_sz_y, grd_sz_y};
169
    /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
170
0
    WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
171
0
    WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
172
0
    WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
173
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
174
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
175
0
    cand_t *cand0 = ps_cand;
176
0
    UWORD16 au2_4x4_sad[NUM_4X4];
177
178
0
    *num_cands = 0;
179
180
    /* Loop to fill up the cand_t array and to calculate num_cands */
181
0
    for(i = 0; i < ps_grid->num_grids; i++)
182
0
    {
183
0
        WORD32 j;
184
0
        WORD32 mask = ps_grid->pi4_grd_mask[i];
185
0
        UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
186
0
        WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
187
0
        WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
188
189
0
        for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
190
0
        {
191
0
            if(mask & 1)
192
0
            {
193
0
                *num_cands = *num_cands + 1;
194
0
                cand0->grid_ix = i;
195
0
                cand0->ref_idx = ps_grid->p_ref_idx[i];
196
0
                cand0->pu1_ref_ptr =
197
0
                    pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
198
0
                cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
199
0
                cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
200
0
                cand0++;
201
0
            }
202
0
        }
203
0
    }
204
205
    /* Loop to compute the SAD's */
206
0
    for(a = 0; a < *num_cands; a++)
207
0
    {
208
0
        cand_t *cand = ps_cand + a;
209
0
        memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
210
0
        for(b = 0; b < NUM_4X4; b++)
211
0
        {
212
0
            WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
213
0
            WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
214
215
0
            for(c = 0; c < NUM_ROWS_IN_4X4; c++)
216
0
            {
217
0
                WORD32 z_cur = (cur_buf_stride)*c + t1;
218
0
                WORD32 z_ref = (ref_buf_stride)*c + t2;
219
0
                for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
220
0
                {
221
0
                    au2_4x4_sad[b] += (UWORD16)ABS(
222
0
                        (((S32)cand->pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
223
0
                }
224
0
            }
225
0
        }
226
227
0
        u2_part_sads[PART_ID_NxN_TL][a] =
228
0
            (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
229
0
        u2_part_sads[PART_ID_NxN_TR][a] =
230
0
            (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
231
0
        u2_part_sads[PART_ID_NxN_BL][a] =
232
0
            (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
233
0
        u2_part_sads[PART_ID_NxN_BR][a] =
234
0
            (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
235
0
        u2_part_sads[PART_ID_Nx2N_L][a] =
236
0
            u2_part_sads[PART_ID_NxN_TL][a] + u2_part_sads[PART_ID_NxN_BL][a];
237
0
        u2_part_sads[PART_ID_Nx2N_R][a] =
238
0
            u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_BR][a];
239
0
        u2_part_sads[PART_ID_2NxN_T][a] =
240
0
            u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_TL][a];
241
0
        u2_part_sads[PART_ID_2NxN_B][a] =
242
0
            u2_part_sads[PART_ID_NxN_BR][a] + u2_part_sads[PART_ID_NxN_BL][a];
243
0
        u2_part_sads[PART_ID_nLx2N_L][a] =
244
0
            (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
245
0
        u2_part_sads[PART_ID_nRx2N_R][a] =
246
0
            (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
247
0
        u2_part_sads[PART_ID_2NxnU_T][a] =
248
0
            (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
249
0
        u2_part_sads[PART_ID_2NxnD_B][a] =
250
0
            (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
251
0
        u2_part_sads[PART_ID_2Nx2N][a] =
252
0
            u2_part_sads[PART_ID_2NxN_T][a] + u2_part_sads[PART_ID_2NxN_B][a];
253
0
        u2_part_sads[PART_ID_2NxnU_B][a] =
254
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnU_T][a];
255
0
        u2_part_sads[PART_ID_2NxnD_T][a] =
256
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnD_B][a];
257
0
        u2_part_sads[PART_ID_nRx2N_L][a] =
258
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nRx2N_R][a];
259
0
        u2_part_sads[PART_ID_nLx2N_R][a] =
260
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nLx2N_L][a];
261
0
    }
262
0
}
263
264
/**
265
********************************************************************************
266
*  @fn     compute_part_sads_for_MxM_blk(grid_ctxt_t *ps_grid,
267
*                                       UWORD8      *pu1_cur_ptr,
268
*                                       WORD32      cur_buf_stride,
269
*                                       WORD32     **pi4_part_sads,
270
*                                       cand_t      *ps_cand,
271
*                                       WORD32      *num_cands
272
*
273
*  @brief  Computes partial SADs and updates partition results for an MxM blk
274
*          and does so for several grids of points. This can be used for
275
*          32x32/64x64 blks with 17 partition updates
276
*
277
*
278
*  @param[in]  ps_grid : Pointer to grid ctxt that has multiple grid of max
279
*                        9 pts per grid
280
*
281
*  @param[in]  pu1_cur_ptr : Top left of input buffer
282
*
283
*  @param[in]  pi4_part_sads : array of pointers, each entry pointing to
284
*                             results to be updated for a given partition
285
*
286
*  @return   The ps_search_results structure has the best result updated for
287
*            the 2Nx2N partition alone.
288
289
********************************************************************************
290
*/
291
void compute_part_sads_for_MxM_blk(
292
    grid_ctxt_t *ps_grid,
293
    UWORD8 *pu1_cur_ptr,
294
    WORD32 cur_buf_stride,
295
    WORD32 **pp_part_sads,
296
    cand_t *ps_cand,
297
    WORD32 *num_cands,
298
    CU_SIZE_T e_cu_size)
299
1.61M
{
300
1.61M
    WORD32 a, b, c, d, i;
301
1.61M
    WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
302
1.61M
    WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
303
304
    /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
305
1.61M
    WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
306
1.61M
    WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
307
1.61M
    WORD32 shift = (WORD32)e_cu_size;
308
309
1.61M
    WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
310
1.61M
    WORD32 cur_buf_stride_lsN = (cur_buf_stride << (1 + shift));
311
1.61M
    WORD32 ref_buf_stride_lsN = (ref_buf_stride << (1 + shift));
312
    /* Num rows and pixels per row: 8 for CU_32x32 and 16 for CU_64x64 */
313
1.61M
    WORD32 num_rows_in_nxn = 2 << shift;
314
1.61M
    WORD32 num_pixels_in_row = 2 << shift;
315
1.61M
    cand_t *cand0 = ps_cand;
316
    /* for a 2Nx2N partition we evaluate nxn SADs, where n = N/2. This is */
317
    /* needed for AMP cases.                                              */
318
1.61M
    WORD32 a_nxn_sad[NUM_4X4];
319
1.61M
    *num_cands = 0;
320
321
    /* Loop to fill up the cand_t array and to calculate num_cands */
322
3.22M
    for(i = 0; i < ps_grid->num_grids; i++)
323
1.61M
    {
324
1.61M
        WORD32 j;
325
1.61M
        WORD32 mask = ps_grid->pi4_grd_mask[i];
326
1.61M
        UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
327
1.61M
        WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
328
1.61M
        WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
329
330
16.1M
        for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
331
14.5M
        {
332
14.5M
            if(mask & 1)
333
1.61M
            {
334
1.61M
                *num_cands = *num_cands + 1;
335
1.61M
                cand0->grid_ix = i;
336
1.61M
                cand0->ref_idx = ps_grid->p_ref_idx[i];
337
1.61M
                cand0->pu1_ref_ptr =
338
1.61M
                    pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
339
1.61M
                cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
340
1.61M
                cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
341
1.61M
                cand0++;
342
1.61M
            }
343
14.5M
        }
344
1.61M
    }
345
346
    /* Loop to compute the SAD's */
347
3.22M
    for(a = 0; a < *num_cands; a++)
348
1.61M
    {
349
1.61M
        cand_t *cand = ps_cand + a;
350
1.61M
        memset(&a_nxn_sad[0], 0, NUM_4X4 * sizeof(WORD32));
351
27.4M
        for(b = 0; b < NUM_4X4; b++)
352
25.8M
        {
353
25.8M
            WORD32 t1 = (b % 4) * num_pixels_in_row + (b >> 2) * cur_buf_stride_lsN;
354
25.8M
            WORD32 t2 = (b % 4) * num_pixels_in_row + (b >> 2) * ref_buf_stride_lsN;
355
356
78.8M
            for(c = 0; c < num_rows_in_nxn; c++)
357
52.9M
            {
358
52.9M
                WORD32 z_cur = (cur_buf_stride)*c + t1;
359
52.9M
                WORD32 z_ref = (ref_buf_stride)*c + t2;
360
170M
                for(d = 0; d < num_pixels_in_row; d++)
361
117M
                {
362
117M
                    a_nxn_sad[b] += (WORD32)ABS(
363
117M
                        (((WORD32)cand->pu1_ref_ptr[(z_ref + d)]) -
364
117M
                         ((WORD32)pu1_cur_ptr[(z_cur + d)])));
365
117M
                }
366
52.9M
            }
367
25.8M
        }
368
369
1.61M
        pp_part_sads[PART_ID_NxN_TL][a] =
370
1.61M
            (a_nxn_sad[0] + a_nxn_sad[1] + a_nxn_sad[4] + a_nxn_sad[5]);
371
1.61M
        pp_part_sads[PART_ID_NxN_TR][a] =
372
1.61M
            (a_nxn_sad[2] + a_nxn_sad[3] + a_nxn_sad[6] + a_nxn_sad[7]);
373
1.61M
        pp_part_sads[PART_ID_NxN_BL][a] =
374
1.61M
            (a_nxn_sad[8] + a_nxn_sad[9] + a_nxn_sad[12] + a_nxn_sad[13]);
375
1.61M
        pp_part_sads[PART_ID_NxN_BR][a] =
376
1.61M
            (a_nxn_sad[10] + a_nxn_sad[11] + a_nxn_sad[14] + a_nxn_sad[15]);
377
1.61M
        pp_part_sads[PART_ID_Nx2N_L][a] =
378
1.61M
            pp_part_sads[PART_ID_NxN_TL][a] + pp_part_sads[PART_ID_NxN_BL][a];
379
1.61M
        pp_part_sads[PART_ID_Nx2N_R][a] =
380
1.61M
            pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_BR][a];
381
1.61M
        pp_part_sads[PART_ID_2NxN_T][a] =
382
1.61M
            pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_TL][a];
383
1.61M
        pp_part_sads[PART_ID_2NxN_B][a] =
384
1.61M
            pp_part_sads[PART_ID_NxN_BR][a] + pp_part_sads[PART_ID_NxN_BL][a];
385
1.61M
        pp_part_sads[PART_ID_nLx2N_L][a] =
386
1.61M
            (a_nxn_sad[8] + a_nxn_sad[0] + a_nxn_sad[12] + a_nxn_sad[4]);
387
1.61M
        pp_part_sads[PART_ID_nRx2N_R][a] =
388
1.61M
            (a_nxn_sad[3] + a_nxn_sad[7] + a_nxn_sad[15] + a_nxn_sad[11]);
389
1.61M
        pp_part_sads[PART_ID_2NxnU_T][a] =
390
1.61M
            (a_nxn_sad[1] + a_nxn_sad[0] + a_nxn_sad[2] + a_nxn_sad[3]);
391
1.61M
        pp_part_sads[PART_ID_2NxnD_B][a] =
392
1.61M
            (a_nxn_sad[15] + a_nxn_sad[14] + a_nxn_sad[12] + a_nxn_sad[13]);
393
1.61M
        pp_part_sads[PART_ID_2Nx2N][a] =
394
1.61M
            pp_part_sads[PART_ID_2NxN_T][a] + pp_part_sads[PART_ID_2NxN_B][a];
395
1.61M
        pp_part_sads[PART_ID_2NxnU_B][a] =
396
1.61M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnU_T][a];
397
1.61M
        pp_part_sads[PART_ID_2NxnD_T][a] =
398
1.61M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnD_B][a];
399
1.61M
        pp_part_sads[PART_ID_nRx2N_L][a] =
400
1.61M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nRx2N_R][a];
401
1.61M
        pp_part_sads[PART_ID_nLx2N_R][a] =
402
1.61M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nLx2N_L][a];
403
1.61M
    }
404
1.61M
}
405
406
/**
********************************************************************************
*  @fn     hme_evalsad_grid_pu_16x16
*
*  @brief  Wraps the error params into a single-grid ctxt (zero centre mv,
*          ref idx 0, same step in x and y), runs
*          compute_4x4_sads_for_16x16_blk on it, and copies the resulting
*          16-bit partition SADs into ps_prms->pi4_sad_grid.
*
*  @param[in,out]  ps_prms : error params; reads pu1_inp, pu1_ref, the two
*                            strides, i4_step and i4_grid_mask, and writes
*                            TOT_NUM_PARTS * (number of candidates) entries
*                            of pi4_sad_grid
*
*  @return   none
********************************************************************************
*/
void hme_evalsad_grid_pu_16x16(err_prms_t *ps_prms)
{
    grid_ctxt_t s_grid;
    cand_t as_candt[9];
    U16 au2_sad_grid[TOT_NUM_PARTS * 9];
    U16 *apu2_sad_grid[TOT_NUM_PARTS];
    hme_mv_t s_mv = { 0, 0 };
    S32 i4_ref_idx = 0;
    S32 num_candts = 0;
    S32 idx;

    /* One grid, described entirely by the incoming error params */
    s_grid.num_grids = 1;
    s_grid.p_mv = &s_mv;
    s_grid.p_ref_idx = &i4_ref_idx;
    s_grid.ppu1_ref_ptr = &ps_prms->pu1_ref;
    s_grid.pi4_grd_mask = &ps_prms->i4_grid_mask;
    s_grid.ref_buf_stride = ps_prms->i4_ref_stride;
    s_grid.grd_sz_y_x = ((ps_prms->i4_step << 16) | ps_prms->i4_step);

    /* Number of enabled points among the 9 grid positions; this fixes */
    /* the row length of the per-partition SAD table                   */
    for(idx = 0; idx < 9; idx++)
    {
        if(s_grid.pi4_grd_mask[0] & (1 << idx))
        {
            num_candts++;
        }
    }

    /* Carve the flat scratch buffer into one row per partition */
    for(idx = 0; idx < TOT_NUM_PARTS; idx++)
    {
        apu2_sad_grid[idx] = &au2_sad_grid[idx * num_candts];
    }

    compute_4x4_sads_for_16x16_blk(
        &s_grid, ps_prms->pu1_inp, ps_prms->i4_inp_stride, apu2_sad_grid, as_candt, &num_candts);

    /* Widen the 16-bit results into the caller's SAD grid */
    for(idx = 0; idx < TOT_NUM_PARTS * num_candts; idx++)
    {
        ps_prms->pi4_sad_grid[idx] = au2_sad_grid[idx];
    }
}
438
439
/**
********************************************************************************
*  @fn     hme_evalsad_grid_npu_MxN
*
*  @brief  Computes the SAD of one i4_blk_wd x i4_blk_ht block at every
*          enabled point of a 9 point grid. Results are written
*          consecutively, in grid id order, starting at
*          pi4_sad_grid + pi4_valid_part_ids[0] * (number of enabled points).
*
*  @param[in,out]  ps_prms : error params; i4_part_mask is asserted to have
*                            at most one bit set
*
*  @return   none
********************************************************************************
*/
void hme_evalsad_grid_npu_MxN(err_prms_t *ps_prms)
{
    U08 *pu1_inp_base, *pu1_ref_c;
    S32 *pi4_sad = ps_prms->pi4_sad_grid;
    S32 pt, num_active_pts = 0;
    S32 step = ps_prms->i4_step;
    /* Horizontal / vertical distance between neighbouring grid points */
    S32 x_off = step, y_off = step * ps_prms->i4_ref_stride;

    ASSERT((ps_prms->i4_part_mask & (ps_prms->i4_part_mask - 1)) == 0);

    /* Count enabled grid points to locate this partition's output slot */
    for(pt = 0; pt < 9; pt++)
    {
        if(ps_prms->i4_grid_mask & (1 << pt))
        {
            num_active_pts++;
        }
    }
    pi4_sad += (ps_prms->pi4_valid_part_ids[0] * num_active_pts);

    pu1_inp_base = ps_prms->pu1_inp;
    pu1_ref_c = ps_prms->pu1_ref;

    for(pt = 0; pt < 9; pt++)
    {
        if(ps_prms->i4_grid_mask & (1 << pt))
        {
            S32 sad = 0, row, col;
            U08 *pu1_inp, *pu1_ref;

            /* Reference block for this grid point */
            pu1_ref = pu1_ref_c + x_off * gai1_grid_id_to_x[pt];
            pu1_ref += y_off * gai1_grid_id_to_y[pt];
            pu1_inp = pu1_inp_base;

            for(row = 0; row < ps_prms->i4_blk_ht; row++)
            {
                for(col = 0; col < ps_prms->i4_blk_wd; col++)
                {
                    sad += (ABS((pu1_inp[col] - pu1_ref[col])));
                }
                pu1_inp += ps_prms->i4_inp_stride;
                pu1_ref += ps_prms->i4_ref_stride;
            }
            *pi4_sad++ = sad;
        }
    }
}
483
484
WORD32 hme_evalsad_pt_npu_MxN_8bit_compute(
485
    WORD32 ht,
486
    WORD32 wd,
487
    UWORD8 *pu1_inp,
488
    UWORD8 *pu1_ref,
489
    WORD32 i4_inp_stride,
490
    WORD32 i4_ref_stride)
491
2.80M
{
492
2.80M
    WORD32 i, j;
493
2.80M
    WORD32 sad = 0;
494
23.9M
    for(i = 0; i < ht; i++)
495
21.1M
    {
496
184M
        for(j = 0; j < wd; j++)
497
163M
        {
498
163M
            sad += (ABS(((S32)pu1_inp[j] - (S32)pu1_ref[j])));
499
163M
        }
500
21.1M
        pu1_inp += i4_inp_stride;
501
21.1M
        pu1_ref += i4_ref_stride;
502
21.1M
    }
503
2.80M
    return sad;
504
2.80M
}
505
506
/**
********************************************************************************
*  @fn     hme_evalsad_pt_npu_MxN_8bit
*
*  @brief  Single point SAD: forwards the block dimensions, buffers and
*          strides held in ps_prms to hme_evalsad_pt_npu_MxN_8bit_compute
*          and stores the result in ps_prms->pi4_sad_grid[0].
*
*  @param[in,out]  ps_prms : error params
*
*  @return   none
********************************************************************************
*/
void hme_evalsad_pt_npu_MxN_8bit(err_prms_t *ps_prms)
{
    ps_prms->pi4_sad_grid[0] = hme_evalsad_pt_npu_MxN_8bit_compute(
        ps_prms->i4_blk_ht,
        ps_prms->i4_blk_wd,
        ps_prms->pu1_inp,
        ps_prms->pu1_ref,
        ps_prms->i4_inp_stride,
        ps_prms->i4_ref_stride);
}
520
521
/**
********************************************************************************
*  @fn     compute_satd_8bit
*
*  @brief  Accumulates the values returned by the HAD function over all
*          transform-sized tiles of the block and stores the total in
*          ps_prms->pi4_sad_grid[0]. Tiles are 8x8 (pf_HAD_8x8_8bit) when
*          either block dimension exceeds 16, otherwise 4x4
*          (pf_HAD_4x4_8bit).
*
*  @param[in,out]  ps_prms : error params; supplies the input and reference
*                            buffers, their strides, the block dimensions and
*                            the optimised function list
*
*  @return   none
********************************************************************************
*/
void compute_satd_8bit(err_prms_t *ps_prms)
{
    U08 *pu1_src_row = ps_prms->pu1_inp;
    U08 *pu1_pred_row = ps_prms->pu1_ref;
    S32 src_strd = ps_prms->i4_inp_stride;
    S32 pred_strd = ps_prms->i4_ref_stride;
    S32 wd = ps_prms->i4_blk_wd;
    S32 ht = ps_prms->i4_blk_ht;
    U32 u4_satd = 0;
    WORD32 x, y;
    /* Transform size: 8 once either dimension is larger than 16, else 4 */
    WORD32 use_8x8 = ((wd > 0x10) || (ht > 0x10));
    WORD32 tile = use_8x8 ? 8 : 4;

    for(y = 0; y < ht; y += tile)
    {
        for(x = 0; x < wd; x += tile)
        {
            if(use_8x8)
            {
                u4_satd += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
                    &pu1_src_row[x], src_strd, &pu1_pred_row[x], pred_strd, NULL, 1);
            }
            else
            {
                u4_satd += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
                    &pu1_src_row[x], src_strd, &pu1_pred_row[x], pred_strd, NULL, 1);
            }
        }
        /* Move both buffers down by one row of tiles */
        pu1_src_row += src_strd * tile;
        pu1_pred_row += pred_strd * tile;
    }

    ps_prms->pi4_sad_grid[0] = (S32)u4_satd;
}
574
575
void hme_init_pred_part(
576
    pred_ctxt_t *ps_pred_ctxt,
577
    search_node_t *ps_tl,
578
    search_node_t *ps_t,
579
    search_node_t *ps_tr,
580
    search_node_t *ps_l,
581
    search_node_t *ps_bl,
582
    search_node_t *ps_coloc,
583
    search_node_t *ps_zeromv,
584
    search_node_t **pps_proj_coloc,
585
    PART_ID_T e_part_id)
586
1.46M
{
587
1.46M
    pred_candt_nodes_t *ps_candt_nodes;
588
589
1.46M
    ps_candt_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
590
591
1.46M
    ps_candt_nodes->ps_tl = ps_tl;
592
1.46M
    ps_candt_nodes->ps_tr = ps_tr;
593
1.46M
    ps_candt_nodes->ps_t = ps_t;
594
1.46M
    ps_candt_nodes->ps_l = ps_l;
595
1.46M
    ps_candt_nodes->ps_bl = ps_bl;
596
1.46M
    ps_candt_nodes->ps_coloc = ps_coloc;
597
1.46M
    ps_candt_nodes->ps_zeromv = ps_zeromv;
598
1.46M
    ps_candt_nodes->pps_proj_coloc = pps_proj_coloc;
599
1.46M
}
600
601
void hme_init_pred_ctxt_no_encode(
602
    pred_ctxt_t *ps_pred_ctxt,
603
    search_results_t *ps_search_results,
604
    search_node_t *ps_top_candts,
605
    search_node_t *ps_left_candts,
606
    search_node_t **pps_proj_coloc_candts,
607
    search_node_t *ps_coloc_candts,
608
    search_node_t *ps_zeromv_candt,
609
    S32 pred_lx,
610
    S32 lambda,
611
    S32 lambda_q_shift,
612
    U08 **ppu1_ref_bits_tlu,
613
    S16 *pi2_ref_scf)
614
11.8k
{
615
11.8k
    search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
616
11.8k
    search_node_t *ps_coloc;
617
11.8k
    PART_ID_T e_part_id;
618
619
    /* Assume that resolution is subpel to begin with */
620
11.8k
    ps_pred_ctxt->mv_pel = 0;  // FPEL
621
622
    /* lambda and pred_lx (PRED_L0/PRED_L1) */
623
11.8k
    ps_pred_ctxt->lambda = lambda;
624
11.8k
    ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
625
11.8k
    ps_pred_ctxt->pred_lx = pred_lx;
626
11.8k
    ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
627
11.8k
    ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
628
11.8k
    ps_pred_ctxt->proj_used = 0;
629
630
    /* Bottom left should not be valid */
631
11.8k
    ASSERT(ps_left_candts[2].u1_is_avail == 0);
632
11.8k
    ps_invalid = &ps_left_candts[2];
633
634
    /*************************************************************************/
635
    /* for the case of no encode, the idea is to set up cants as follows     */
636
    /*                                                                       */
637
    /*    ____ ______________                                                */
638
    /*   | TL | T  | T1 | TR |                                               */
639
    /*   |____|____|____|____|                                               */
640
    /*   | L  | b0 | b1 |                                                    */
641
    /*   |____|____|____|                                                    */
642
    /*   | L1 | b2 | b3 |                                                    */
643
    /*   |____|____|____|                                                    */
644
    /*   | BL |                                                              */
645
    /*   |____|                                                              */
646
    /*                                                                       */
647
    /*  If use_4x4 is 0, then b0,b1,b2,b3 are single 8x8 blk. then T=T1      */
648
    /* and L=L1. topleft, top and topright are TL,T,TR respectively          */
649
    /* Left and bottom left is L and BL respectively.                        */
650
    /* If use_4x4 is 1: then the above holds true only for PARTID = 0 (8x8)  */
651
    /*  For the 4 subblocks (partids 4-7)                                    */
652
    /*                                                                       */
653
    /*  Block   Left   Top   Top Left   Top Right   Bottom Left             */
654
    /*    b0    L      T      TL          T1          L1                     */
655
    /*    b1    b0     T1     T           TR          BL(invalid)            */
656
    /*    b2    L1     b0     L0          b1          BL (invalid)           */
657
    /*    b3    b2     b1     b0          BL(inv)     BL (inv)               */
658
    /*                                                                       */
659
    /* Note : For block b1, bottom left pts to b2, which is not yet ready    */
660
    /*  hence it is kept invalid and made to pt to BL. For block b3 top rt   */
661
    /* is invalid and hence made to pt to BL which is invalid.               */
662
    /* BL is invalid since it lies in a bottom left 8x8 blk and not yet ready*/
663
    /*************************************************************************/
664
665
    /* ps_coloc always points to a fixe candt (global) */
666
    /* TODO : replace incoming ps_coloc from global to geniune coloc */
667
11.8k
    ps_coloc = ps_coloc_candts;
668
669
    /* INITIALIZATION OF 8x8 BLK */
670
11.8k
    ps_tl = ps_top_candts;
671
11.8k
    ps_t = ps_tl + 2;
672
11.8k
    ps_tr = ps_t + 1;
673
11.8k
    ps_l = ps_left_candts + 1;
674
11.8k
    ps_bl = ps_invalid;
675
11.8k
    e_part_id = PART_ID_2Nx2N;
676
11.8k
    hme_init_pred_part(
677
11.8k
        ps_pred_ctxt,
678
11.8k
        ps_tl,
679
11.8k
        ps_t,
680
11.8k
        ps_tr,
681
11.8k
        ps_l,
682
11.8k
        ps_bl,
683
11.8k
        ps_coloc,
684
11.8k
        ps_zeromv_candt,
685
11.8k
        pps_proj_coloc_candts,
686
11.8k
        e_part_id);
687
688
    /* INITIALIZATION OF 4x4 TL BLK */
689
11.8k
    e_part_id = PART_ID_NxN_TL;
690
11.8k
    ps_tl = ps_top_candts;
691
11.8k
    ps_t = ps_tl + 1;
692
11.8k
    ps_tr = ps_t + 1;
693
11.8k
    ps_l = ps_left_candts;
694
11.8k
    ps_bl = ps_l + 1;
695
11.8k
    hme_init_pred_part(
696
11.8k
        ps_pred_ctxt,
697
11.8k
        ps_tl,
698
11.8k
        ps_t,
699
11.8k
        ps_tr,
700
11.8k
        ps_l,
701
11.8k
        ps_bl,
702
11.8k
        ps_coloc,
703
11.8k
        ps_zeromv_candt,
704
11.8k
        pps_proj_coloc_candts,
705
11.8k
        e_part_id);
706
707
    /* INITIALIZATION OF 4x4 TR BLK */
708
11.8k
    e_part_id = PART_ID_NxN_TR;
709
11.8k
    ps_tl = ps_top_candts + 1;
710
11.8k
    ps_t = ps_tl + 1;
711
11.8k
    ps_tr = ps_t + 1;
712
11.8k
    ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
713
11.8k
    ps_bl = ps_invalid;
714
11.8k
    hme_init_pred_part(
715
11.8k
        ps_pred_ctxt,
716
11.8k
        ps_tl,
717
11.8k
        ps_t,
718
11.8k
        ps_tr,
719
11.8k
        ps_l,
720
11.8k
        ps_bl,
721
11.8k
        ps_coloc,
722
11.8k
        ps_zeromv_candt,
723
11.8k
        pps_proj_coloc_candts,
724
11.8k
        e_part_id);
725
726
    /* INITIALIZATION OF 4x4 BL BLK */
727
11.8k
    e_part_id = PART_ID_NxN_BL;
728
11.8k
    ps_tl = ps_left_candts;
729
11.8k
    ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
730
11.8k
    ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
731
11.8k
    ps_l = ps_left_candts + 1;
732
11.8k
    ps_bl = ps_invalid;  //invalid
733
11.8k
    hme_init_pred_part(
734
11.8k
        ps_pred_ctxt,
735
11.8k
        ps_tl,
736
11.8k
        ps_t,
737
11.8k
        ps_tr,
738
11.8k
        ps_l,
739
11.8k
        ps_bl,
740
11.8k
        ps_coloc,
741
11.8k
        ps_zeromv_candt,
742
11.8k
        pps_proj_coloc_candts,
743
11.8k
        e_part_id);
744
745
    /* INITIALIZATION OF 4x4 BR BLK */
746
11.8k
    e_part_id = PART_ID_NxN_BR;
747
11.8k
    ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
748
11.8k
    ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
749
11.8k
    ps_tr = ps_invalid;  // invalid
750
11.8k
    ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
751
11.8k
    ps_bl = ps_invalid;  // invalid
752
11.8k
    hme_init_pred_part(
753
11.8k
        ps_pred_ctxt,
754
11.8k
        ps_tl,
755
11.8k
        ps_t,
756
11.8k
        ps_tr,
757
11.8k
        ps_l,
758
11.8k
        ps_bl,
759
11.8k
        ps_coloc,
760
11.8k
        ps_zeromv_candt,
761
11.8k
        pps_proj_coloc_candts,
762
11.8k
        e_part_id);
763
11.8k
}
764
765
void hme_init_pred_ctxt_encode(
766
    pred_ctxt_t *ps_pred_ctxt,
767
    search_results_t *ps_search_results,
768
    search_node_t *ps_coloc_candts,
769
    search_node_t *ps_zeromv_candt,
770
    mv_grid_t *ps_mv_grid,
771
    S32 pred_lx,
772
    S32 lambda,
773
    S32 lambda_q_shift,
774
    U08 **ppu1_ref_bits_tlu,
775
    S16 *pi2_ref_scf)
776
82.9k
{
777
82.9k
    search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
778
82.9k
    search_node_t *ps_coloc;
779
82.9k
    search_node_t *ps_grid_cu_base;
780
82.9k
    CU_SIZE_T e_cu_size = ps_search_results->e_cu_size;
781
782
    /* Part Start, Part sizes in 4x4 units */
783
82.9k
    S32 part_wd, part_ht, part_start_x, part_start_y;
784
785
    /* Partition type, number of partitions in type */
786
82.9k
    S32 part_id;
787
788
    /* Coordinates of the CU in 4x4 units */
789
82.9k
    S32 cu_start_x, cu_start_y;
790
82.9k
    S32 shift = e_cu_size;
791
792
    /* top right and bot left validity at CU level */
793
82.9k
    S32 cu_tr_valid, cu_bl_valid;
794
    /* strideo f the grid */
795
82.9k
    S32 grid_stride = ps_mv_grid->i4_stride;
796
797
82.9k
    ps_pred_ctxt->lambda = lambda;
798
82.9k
    ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
799
82.9k
    ps_pred_ctxt->pred_lx = pred_lx;
800
82.9k
    ps_pred_ctxt->mv_pel = 0;
801
82.9k
    ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
802
82.9k
    ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
803
82.9k
    ps_pred_ctxt->proj_used = 1;
804
805
82.9k
    cu_start_x = ps_search_results->u1_x_off >> 2;
806
82.9k
    cu_start_y = ps_search_results->u1_y_off >> 2;
807
808
    /* Coloc always points to fixed global candt */
809
82.9k
    ps_coloc = ps_coloc_candts;
810
811
    /* Go to base of the CU in the MV Grid */
812
82.9k
    ps_grid_cu_base = &ps_mv_grid->as_node[0];
813
82.9k
    ps_grid_cu_base += (ps_mv_grid->i4_start_offset + cu_start_x);
814
82.9k
    ps_grid_cu_base += (grid_stride * cu_start_y);
815
816
    /* points to the real bottom left of the grid, will never be valid */
817
82.9k
    ps_invalid = &ps_mv_grid->as_node[0];
818
82.9k
    ps_invalid += (grid_stride * 17);
819
820
82.9k
    {
821
82.9k
        S32 shift = 1 + e_cu_size;
822
82.9k
        cu_tr_valid = gau1_cu_tr_valid[cu_start_y >> shift][cu_start_x >> shift];
823
82.9k
        cu_bl_valid = gau1_cu_bl_valid[cu_start_y >> shift][cu_start_x >> shift];
824
82.9k
    }
825
826
    /*************************************************************************/
827
    /* for the case of    encode, the idea is to set up cants as follows     */
828
    /*                                                                       */
829
    /*    ____ ______________ ____ ____                                      */
830
    /*   | T0 | T1 | T2 | T3 | T4 | T5 |                                     */
831
    /*   |____|____|____|____|____|____|                                     */
832
    /*   | L1 |    |              |                                          */
833
    /*   |____|    |              |                                          */
834
    /*   | L2 | p0 |     p1       |                                          */
835
    /*   |____|    |              |                                          */
836
    /*   | L3 |    |              |                                          */
837
    /*   |____|    |              |                                          */
838
    /*   | L4 | L' |              |                                          */
839
    /*   |____|____|______________|                                          */
840
    /*   | BL |                                                              */
841
    /*   |____|                                                              */
842
    /*  The example is shown with 16x16 CU, though it can be generalized     */
843
    /*  This CU has 2 partitions, cu_wd = 4. also p_wd, p_ht are partition   */
844
    /*  width and ht in 4x4 units.                                           */
845
    /*  For a given CU, derive the top left, top and bottom left and top rt  */
846
    /*  pts. Left and top are assumed to be valid.                           */
847
    /*  IF there aretwo partitions in the CU (like p0 and p1) and vertical,  */
848
    /*  then for first partition, left, top, top left and top right valid    */
849
    /*  Bottom left is valid. store these validity flags. Also store the     */
850
    /*  grid offsets of the partitions w.r.t. CU start in units of 4x4.For p0*/
851
    /*  Left grid offset = -1, 3. Top Grd offset = -1, 0.                    */
852
    /*  Top left grid offset = -1, -1. Top right = 1, -1. BL = -1, 4.        */
853
    /*  For p1, validity flags are left, top, top left, top right, valid.    */
854
    /*  BL is invalid. Grid offsets are: Left = dont care. T = 1, -1 (T2)    */
855
    /*  TR = 4, -1 (T5). TL = 0, -1 (T1). BL = don't care.                   */
856
    /*  For p1, set the left pred candt to the best search result of p0.     */
857
    /*************************************************************************/
858
859
    /* Loop over all partitions, and identify the 5 neighbours */
860
1.49M
    for(part_id = 0; part_id < TOT_NUM_PARTS; part_id++)
861
1.41M
    {
862
1.41M
        part_attr_t *ps_part_attr = &gas_part_attr_in_cu[part_id];
863
1.41M
        S32 tr_valid, bl_valid, is_vert;
864
1.41M
        search_node_t *ps_grid_pu_base;
865
1.41M
        PART_TYPE_T e_part_type;
866
1.41M
        PART_ID_T first_part;
867
1.41M
        S32 part_num;
868
869
1.41M
        e_part_type = ge_part_id_to_part_type[part_id];
870
1.41M
        first_part = ge_part_type_to_part_id[e_part_type][0];
871
1.41M
        is_vert = gau1_is_vert_part[e_part_type];
872
1.41M
        part_num = gau1_part_id_to_part_num[part_id];
873
1.41M
        tr_valid = gau1_partid_tr_valid[part_id] & cu_tr_valid;
874
1.41M
        bl_valid = gau1_partid_bl_valid[part_id] & cu_bl_valid;
875
876
1.41M
        part_start_x = (ps_part_attr->u1_x_start << shift) >> 2;
877
1.41M
        part_start_y = (ps_part_attr->u1_y_start << shift) >> 2;
878
1.41M
        part_wd = (ps_part_attr->u1_x_count << shift) >> 2;
879
1.41M
        part_ht = (ps_part_attr->u1_y_count << shift) >> 2;
880
881
        /* go to top left of part */
882
1.41M
        ps_grid_pu_base = ps_grid_cu_base + part_start_x;
883
1.41M
        ps_grid_pu_base += (part_start_y * grid_stride);
884
885
1.41M
        ps_tl = ps_grid_pu_base - 1 - grid_stride;
886
1.41M
        ps_t = ps_grid_pu_base - grid_stride + part_wd - 1;
887
1.41M
        ps_l = ps_grid_pu_base - 1 + ((part_ht - 1) * grid_stride);
888
1.41M
        ps_tr = ps_t + 1;
889
1.41M
        ps_bl = ps_l + grid_stride;
890
891
1.41M
        if(!tr_valid)
892
640k
            ps_tr = ps_invalid;
893
1.41M
        if(!bl_valid)
894
1.03M
            ps_bl = ps_invalid;
895
896
1.41M
        if(part_num == 1)
897
580k
        {
898
            /* for cases of two partitions 2nd part has 1st part as candt */
899
            /* if vertical type, left candt of 2nd part is 1st part.      */
900
            /* if horz type, top candt of 2nd part is 1st part.           */
901
580k
            if(is_vert)
902
331k
            {
903
331k
                ps_l = ps_search_results->aps_part_results[pred_lx][first_part];
904
331k
            }
905
248k
            else
906
248k
            {
907
248k
                ps_t = ps_search_results->aps_part_results[pred_lx][first_part];
908
248k
            }
909
580k
        }
910
1.41M
        if(part_num == 2)
911
82.9k
        {
912
            /* only possible for NxN_BL */
913
82.9k
            ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
914
82.9k
            ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
915
82.9k
        }
916
1.41M
        if(part_num == 3)
917
82.9k
        {
918
            /* only possible for NxN_BR */
919
82.9k
            ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
920
82.9k
            ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
921
82.9k
            ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
922
82.9k
        }
923
1.41M
        hme_init_pred_part(
924
1.41M
            ps_pred_ctxt,
925
1.41M
            ps_tl,
926
1.41M
            ps_t,
927
1.41M
            ps_tr,
928
1.41M
            ps_l,
929
1.41M
            ps_bl,
930
1.41M
            ps_coloc,
931
1.41M
            ps_zeromv_candt,
932
1.41M
            NULL,
933
1.41M
            (PART_ID_T)part_id);
934
1.41M
    }
935
82.9k
}
936
937
/**
938
********************************************************************************
939
*  @fn     compute_mv_cost_explicit(search_node_t *ps_node,
940
*                   pred_ctxt_t *ps_pred_ctxt,
941
*                   PART_ID_T e_part_id)
942
*
943
*  @brief  MV cost for explicit search in layers not encoded
944
*
945
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
946
*
947
*  @param[in]  ps_pred_ctxt : mv pred context
948
*
949
*  @param[in]  e_part_id : Partition id.
950
*
951
*  @return   Cost value
952
953
********************************************************************************
954
*/
955
S32 compute_mv_cost_explicit(
956
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
957
2.25M
{
958
2.25M
#define RETURN_FIXED_COST 0
959
2.25M
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
960
2.25M
    pred_candt_nodes_t *ps_pred_nodes;
961
2.25M
    S32 inp_shift = 2 - inp_mv_pel;
962
2.25M
    S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
963
2.25M
    S32 mv_p_x, mv_p_y;
964
2.25M
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
965
2.25M
    S32 cost, ref_bits;
966
967
    /*************************************************************************/
968
    /* Logic for cost computation for explicit search. For such a search,    */
969
    /* it is guaranteed that all predictor candts have same ref id. The only */
970
    /* probable issue is with the availability which needs checking. This fxn*/
971
    /* does not suffer the need to scale predictor candts due to diff ref id */
972
    /*************************************************************************/
973
974
    /* Hack: currently we always assume 2Nx2N. */
975
    /* TODO: get rid of this hack and return cost tuned to each partition */
976
2.25M
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
977
2.25M
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
978
979
    /*************************************************************************/
980
    /* Priority to bottom left availability. Else we go to left. If both are */
981
    /* not available, then a remains null                                    */
982
    /*************************************************************************/
983
2.25M
    if(ps_pred_nodes->ps_tl->u1_is_avail)
984
1.43M
        ps_pred_node_a = ps_pred_nodes->ps_tl;
985
823k
    else if(ps_pred_nodes->ps_l->u1_is_avail)
986
352k
        ps_pred_node_a = ps_pred_nodes->ps_l;
987
988
    /*************************************************************************/
989
    /* For encoder, top left may not be really needed unless we use slices,  */
990
    /* and even then in ME it may not be relevant. So we only consider T or  */
991
    /* TR, as, if both T and TR are not available, TL also will not be       */
992
    /*************************************************************************/
993
2.25M
    if(ps_pred_nodes->ps_tr->u1_is_avail)
994
1.43M
        ps_pred_node_b = ps_pred_nodes->ps_tr;
995
827k
    else if(ps_pred_nodes->ps_t->u1_is_avail)
996
383k
        ps_pred_node_b = ps_pred_nodes->ps_t;
997
998
2.25M
    if(ps_pred_node_a == NULL)
999
471k
    {
1000
471k
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
1001
471k
        if(ps_pred_node_b == NULL)
1002
92.1k
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1003
471k
    }
1004
1.78M
    else if(ps_pred_node_b == NULL)
1005
352k
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1006
1.43M
    else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1007
751k
    {
1008
751k
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1009
751k
    }
1010
1011
2.25M
    mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1012
2.25M
    mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1013
2.25M
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1014
2.25M
    mvdx1 = ABS(mvdx1);
1015
2.25M
    mvdy1 = ABS(mvdy1);
1016
1017
2.25M
    mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1018
2.25M
    mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1019
2.25M
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1020
2.25M
    mvdx2 = ABS(mvdx2);
1021
2.25M
    mvdy2 = ABS(mvdy2);
1022
1023
2.25M
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1024
745k
    {
1025
745k
        cost =
1026
745k
            hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1027
745k
    }
1028
1.51M
    else
1029
1.51M
    {
1030
1.51M
        cost =
1031
1.51M
            hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1032
1.51M
    }
1033
2.25M
    {
1034
2.25M
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1035
2.25M
        return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1036
2.25M
    }
1037
2.25M
}
1038
/**
1039
********************************************************************************
1040
*  @fn     compute_mv_cost_coarse(search_node_t *ps_node,
1041
*                   pred_ctxt_t *ps_pred_ctxt,
1042
*                   PART_ID_T e_part_id)
1043
*
1044
*  @brief  MV cost for coarse explicit search in coarsest layer
1045
*
1046
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1047
*
1048
*  @param[in]  ps_pred_ctxt : mv pred context
1049
*
1050
*  @param[in]  e_part_id : Partition id.
1051
*
1052
*  @return   Cost value
1053
1054
********************************************************************************
1055
*/
1056
S32 compute_mv_cost_coarse(
1057
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1058
2.25M
{
1059
2.25M
    ARG_NOT_USED(e_part_id);
1060
1061
2.25M
    return (compute_mv_cost_explicit(ps_node, ps_pred_ctxt, PART_ID_2Nx2N, inp_mv_pel));
1062
2.25M
}
1063
1064
/**
1065
********************************************************************************
1066
*  @fn     compute_mv_cost_coarse_high_speed(search_node_t *ps_node,
1067
*                                            pred_ctxt_t *ps_pred_ctxt,
1068
*                                            PART_ID_T e_part_id)
1069
*
1070
*  @brief  MV cost for coarse explicit search in coarsest layer
1071
*
1072
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1073
*
1074
*  @param[in]  ps_pred_ctxt : mv pred context
1075
*
1076
*  @param[in]  e_part_id : Partition id.
1077
*
1078
*  @return   Cost value
1079
1080
********************************************************************************
1081
*/
1082
S32 compute_mv_cost_coarse_high_speed(
1083
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1084
27.7M
{
1085
27.7M
    S32 rnd, mvx, mvy, i4_search_idx;
1086
27.7M
    S32 cost;
1087
1088
27.7M
    mvx = ps_node->s_mv.i2_mvx;
1089
27.7M
    mvy = ps_node->s_mv.i2_mvy;
1090
27.7M
    i4_search_idx = ps_node->i1_ref_idx;
1091
1092
27.7M
    cost = (2 * hme_get_range(ABS(mvx)) - 1) + (2 * hme_get_range(ABS(mvy)) - 1) + i4_search_idx;
1093
27.7M
    cost += (mvx != 0) ? 1 : 0;
1094
27.7M
    cost += (mvy != 0) ? 1 : 0;
1095
27.7M
    rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1096
27.7M
    cost = (cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift;
1097
27.7M
    return cost;
1098
27.7M
}
1099
1100
/**
1101
********************************************************************************
1102
*  @fn     compute_mv_cost_explicit_refine(search_node_t *ps_node,
1103
*                                          pred_ctxt_t *ps_pred_ctxt,
1104
*                                          PART_ID_T e_part_id)
1105
*
1106
*  @brief  MV cost for explicit search in layers not encoded. Always returns
1107
*          cost of the projected colocated candidate
1108
*
1109
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1110
*
1111
*  @param[in]  ps_pred_ctxt : mv pred context
1112
*
1113
*  @param[in]  e_part_id : Partition id.
1114
*
1115
*  @return   Cost value
1116
1117
********************************************************************************
1118
*/
1119
S32 compute_mv_cost_explicit_refine(
1120
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1121
10.0M
{
1122
10.0M
    search_node_t *ps_pred_node_a = NULL;
1123
10.0M
    pred_candt_nodes_t *ps_pred_nodes;
1124
10.0M
    S32 inp_shift = 2 - inp_mv_pel;
1125
10.0M
    S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
1126
10.0M
    S32 mv_p_x, mv_p_y;
1127
10.0M
    S16 mvdx1, mvdy1;
1128
10.0M
    S32 cost, ref_bits;
1129
1130
10.0M
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1131
10.0M
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1132
1133
10.0M
    ps_pred_node_a = ps_pred_nodes->pps_proj_coloc[0];
1134
1135
10.0M
    mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1136
10.0M
    mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1137
10.0M
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1138
10.0M
    mvdx1 = ABS(mvdx1);
1139
10.0M
    mvdy1 = ABS(mvdy1);
1140
1141
10.0M
    cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1142
1143
10.0M
    {
1144
10.0M
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1145
10.0M
        return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1146
10.0M
    }
1147
10.0M
}
1148
1149
/**
1150
********************************************************************************
1151
*  @fn     compute_mv_cost_refine(search_node_t *ps_node,
1152
*                   pred_ctxt_t *ps_pred_ctxt,
1153
*                   PART_ID_T e_part_id)
1154
*
1155
*  @brief  MV cost for coarse explicit search in coarsest layer
1156
*
1157
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1158
*
1159
*  @param[in]  ps_pred_ctxt : mv pred context
1160
*
1161
*  @param[in]  e_part_id : Partition id.
1162
*
1163
*  @return   Cost value
1164
1165
********************************************************************************
1166
*/
1167
S32 compute_mv_cost_refine(
1168
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1169
10.0M
{
1170
10.0M
    return (compute_mv_cost_explicit_refine(ps_node, ps_pred_ctxt, e_part_id, inp_mv_pel));
1171
10.0M
}
1172
1173
S32 compute_mv_cost_implicit(
1174
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1175
0
{
1176
0
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
1177
0
    pred_candt_nodes_t *ps_pred_nodes;
1178
0
    S08 i1_ref_idx;
1179
0
    S08 i1_ref_tl = -1, i1_ref_tr = -1, i1_ref_t = -1;
1180
0
    S08 i1_ref_bl = -1, i1_ref_l = -1;
1181
0
    S32 inp_shift = 2 - inp_mv_pel;
1182
0
    S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel;*/
1183
0
    S32 ref_bits, cost;
1184
0
    S32 mv_p_x, mv_p_y;
1185
0
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
1186
1187
    //return 0;
1188
0
    i1_ref_idx = ps_node->i1_ref_idx;
1189
1190
    /*************************************************************************/
1191
    /* Logic for cost computation for explicit search. For such a search,    */
1192
    /* it is guaranteed that all predictor candts have same ref id. The only */
1193
    /* probable issue is with the availability which needs checking. This fxn*/
1194
    /* does not suffer the need to scale predictor candts due to diff ref id */
1195
    /*************************************************************************/
1196
1197
0
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1198
0
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
1199
1200
    /*************************************************************************/
1201
    /* Priority to bottom left availability. Else we go to left. If both are */
1202
    /* not available, then a remains null                                    */
1203
    /*************************************************************************/
1204
0
    if(ps_pred_nodes->ps_bl->u1_is_avail)
1205
0
        i1_ref_bl = ps_pred_nodes->ps_bl->i1_ref_idx;
1206
0
    if(ps_pred_nodes->ps_l->u1_is_avail)
1207
0
        i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
1208
0
    if(i1_ref_bl == i1_ref_idx)
1209
0
        ps_pred_node_a = ps_pred_nodes->ps_bl;
1210
0
    else if(i1_ref_l == i1_ref_idx)
1211
0
        ps_pred_node_a = ps_pred_nodes->ps_l;
1212
0
    if(ps_pred_node_a == NULL)
1213
0
    {
1214
0
        if(i1_ref_bl != -1)
1215
0
            ps_pred_node_a = ps_pred_nodes->ps_bl;
1216
0
        else if(i1_ref_l != -1)
1217
0
            ps_pred_node_a = ps_pred_nodes->ps_l;
1218
0
    }
1219
1220
    /*************************************************************************/
1221
    /* For encoder, top left may not be really needed unless we use slices,  */
1222
    /* and even then in ME it may not be relevant. So we only consider T or  */
1223
    /* TR, as, if both T and TR are not available, TL also will not be       */
1224
    /*************************************************************************/
1225
0
    if(ps_pred_nodes->ps_tr->u1_is_avail)
1226
0
        i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
1227
0
    if(ps_pred_nodes->ps_t->u1_is_avail)
1228
0
        i1_ref_t = ps_pred_nodes->ps_t->i1_ref_idx;
1229
0
    if(ps_pred_nodes->ps_tl->u1_is_avail)
1230
0
        i1_ref_tl = ps_pred_nodes->ps_tl->i1_ref_idx;
1231
0
    if(i1_ref_tr == i1_ref_idx)
1232
0
        ps_pred_node_b = ps_pred_nodes->ps_tr;
1233
0
    else if(i1_ref_t == i1_ref_idx)
1234
0
        ps_pred_node_b = ps_pred_nodes->ps_t;
1235
0
    else if(i1_ref_tl == i1_ref_idx)
1236
0
        ps_pred_node_b = ps_pred_nodes->ps_tl;
1237
1238
0
    if(ps_pred_node_b == NULL)
1239
0
    {
1240
0
        if(i1_ref_tr != -1)
1241
0
            ps_pred_node_b = ps_pred_nodes->ps_tr;
1242
0
        else if(i1_ref_t != -1)
1243
0
            ps_pred_node_b = ps_pred_nodes->ps_t;
1244
0
        else if(i1_ref_tl != -1)
1245
0
            ps_pred_node_b = ps_pred_nodes->ps_tl;
1246
0
    }
1247
0
    if(ps_pred_node_a == NULL)
1248
0
    {
1249
0
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
1250
0
        if(ps_pred_node_b == NULL)
1251
0
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1252
0
    }
1253
0
    else if(ps_pred_node_b == NULL)
1254
0
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1255
0
    else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1256
0
    {
1257
0
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1258
0
    }
1259
1260
0
    if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
1261
0
    {
1262
0
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1263
0
    }
1264
0
    else
1265
0
    {
1266
0
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1267
0
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1268
0
    }
1269
0
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1270
0
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1271
0
    mvdx1 = ABS(mvdx1);
1272
0
    mvdy1 = ABS(mvdy1);
1273
1274
0
    if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
1275
0
    {
1276
0
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1277
0
    }
1278
0
    else
1279
0
    {
1280
0
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1281
0
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1282
0
    }
1283
0
    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
1284
0
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1285
0
    mvdx2 = ABS(mvdx2);
1286
0
    mvdy2 = ABS(mvdy2);
1287
1288
0
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1289
0
    {
1290
0
        cost = 2 * hme_get_range(mvdx1) + 2 * hme_get_range(mvdy1) + 2 * (mvdx1 > 0) +
1291
0
               2 * (mvdy1 > 0) + ref_bits + 2;
1292
0
    }
1293
0
    else
1294
0
    {
1295
0
        cost = 2 * hme_get_range(mvdx2) + 2 * hme_get_range(mvdy2) + 2 * (mvdx2 > 0) +
1296
0
               2 * (mvdy2 > 0) + ref_bits + 2;
1297
0
    }
1298
0
    {
1299
        /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
1300
0
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift);
1301
0
        S32 tot_cost = (cost * ps_pred_ctxt->lambda) << 1;
1302
1303
0
        tot_cost += (gau1_bits_for_part_id_q1[e_part_id] * ps_pred_ctxt->lambda);
1304
0
        return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift + 1));
1305
0
    }
1306
0
}
1307
1308
S32 compute_mv_cost_implicit_high_speed(
1309
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1310
145k
{
1311
145k
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
1312
145k
    pred_candt_nodes_t *ps_pred_nodes;
1313
145k
    S08 i1_ref_idx;
1314
145k
    S08 i1_ref_tr = -1;
1315
145k
    S08 i1_ref_l = -1;
1316
145k
    S32 inp_shift = 2 - inp_mv_pel;
1317
145k
    S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
1318
145k
    S32 ref_bits, cost;
1319
145k
    S32 mv_p_x, mv_p_y;
1320
145k
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
1321
1322
145k
    i1_ref_idx = ps_node->i1_ref_idx;
1323
1324
145k
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1325
145k
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
1326
1327
    /*************************************************************************/
1328
    /* Priority to bottom left availability. Else we go to left. If both are */
1329
    /* not available, then a remains null                                    */
1330
    /*************************************************************************/
1331
145k
    if(ps_pred_nodes->ps_l->u1_is_avail)
1332
117k
    {
1333
117k
        i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
1334
117k
        ps_pred_node_a = ps_pred_nodes->ps_l;
1335
117k
    }
1336
1337
    /*************************************************************************/
1338
    /* For encoder, top left may not be really needed unless we use slices,  */
1339
    /* and even then in ME it may not be relevant. So we only consider T or  */
1340
    /* TR, as, if both T and TR are not available, TL also will not be       */
1341
    /*************************************************************************/
1342
1343
145k
    if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
1344
25.7k
    {
1345
25.7k
        i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
1346
25.7k
        ps_pred_node_b = ps_pred_nodes->ps_tr;
1347
25.7k
    }
1348
119k
    else
1349
119k
    {
1350
119k
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1351
119k
    }
1352
1353
145k
    if(ps_pred_node_a == NULL)
1354
28.5k
    {
1355
28.5k
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
1356
1357
28.5k
        if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
1358
19.5k
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1359
28.5k
    }
1360
1361
145k
    if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
1362
17.8k
    {
1363
17.8k
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1364
17.8k
    }
1365
127k
    else
1366
127k
    {
1367
127k
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1368
127k
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1369
127k
    }
1370
1371
145k
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1372
145k
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1373
145k
    mvdx1 = ABS(mvdx1);
1374
145k
    mvdy1 = ABS(mvdy1);
1375
1376
145k
    if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
1377
27.3k
    {
1378
27.3k
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1379
27.3k
    }
1380
118k
    else
1381
118k
    {
1382
118k
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1383
118k
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1384
118k
    }
1385
1386
145k
    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
1387
145k
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1388
145k
    mvdx2 = ABS(mvdx2);
1389
145k
    mvdy2 = ABS(mvdy2);
1390
1391
145k
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1392
46.8k
    {
1393
46.8k
        cost =
1394
46.8k
            hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1395
46.8k
    }
1396
98.8k
    else
1397
98.8k
    {
1398
98.8k
        cost =
1399
98.8k
            hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1400
98.8k
    }
1401
145k
    {
1402
        /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
1403
145k
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1404
145k
        S32 tot_cost = (cost * ps_pred_ctxt->lambda);
1405
1406
145k
        return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift));
1407
145k
    }
1408
145k
}
1409
1410
S32 compute_mv_cost_implicit_high_speed_modified(
1411
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1412
0
{
1413
0
    search_node_t *ps_pred_node_a = NULL;
1414
0
    pred_candt_nodes_t *ps_pred_nodes;
1415
0
    S32 inp_shift = 2 - inp_mv_pel;
1416
0
    S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
1417
0
    S32 mv_p_x, mv_p_y;
1418
0
    S16 mvdx1, mvdy1;
1419
0
    S32 cost, ref_bits;
1420
1421
0
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1422
0
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1423
1424
0
    ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
1425
1426
0
    mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1427
0
    mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1428
0
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1429
0
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1430
0
    mvdx1 = ABS(mvdx1);
1431
0
    mvdy1 = ABS(mvdy1);
1432
1433
0
    cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1434
1435
0
    {
1436
0
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1437
0
        return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1438
0
    }
1439
0
}
1440
1441
/**
********************************************************************************
*  @fn     hme_update_results_grid_pu_bestn_xtreme_speed(ps_result_prms)
*
*  @brief  Grid result update that looks only at the first entry of
*          pi4_valid_part_ids and expects one result per part (ASSERTed).
*          For every grid pt enabled in i4_grid_mask the mv cost from
*          compute_mv_cost_coarse_high_speed is added to the SAD of that pt,
*          and the stored best node is overwritten when the sum is lower.
*          Legacy note carried over from the previous version: "modified with
*          assumption that only 2NxN_B and Nx2N_R is modified".
*
*  @param[in,out]  ps_result_prms : inputs are read from here; i4_min_cost is
*                  written on every improvement and i4_min_id on exit
*
*  @return   None. i4_min_id holds the last grid pt that replaced the best
*            node, or PT_C when no pt did.
********************************************************************************
*/
void hme_update_results_grid_pu_bestn_xtreme_speed(result_upd_prms_t *ps_result_prms)
{
    search_results_t *ps_results = ps_result_prms->ps_search_results;
    const search_node_t *ps_centre = ps_result_prms->ps_search_node_base;
    S32 *pi4_part_ids = ps_result_prms->pi4_valid_part_ids;
    S32 *pi4_sads = ps_result_prms->pi4_sad_grid;
    S32 i4_active_mask = ps_result_prms->i4_grid_mask;
    S32 i4_pel_step = ps_result_prms->i4_step;
    S32 i4_num_best = (S32)ps_results->u1_num_results_per_part;
    S32 i4_ref = (S32)ps_result_prms->i1_ref_idx;
    S32 i4_pred_dir = 1 - ps_results->pu1_is_past[i4_ref];
    S32 i4_first_part = pi4_part_ids[0];
    S32 i4_part = pi4_part_ids[0];
    /* Best results stored for this ref idx and part */
    search_node_t *ps_best = ps_results->aps_part_results[i4_ref][i4_part];
    /* Working copy of the centre node; only its mv changes per grid pt */
    search_node_t s_cand = *ps_centre;
    S32 i4_num_active = 0;
    S32 i4_active_idx = 0;
    S32 i4_winner = (S32)PT_C;
    S32 i4_pt;

    /* Number of enabled pts; the SAD of part p is read at an offset of */
    /* i4_num_active * p from the entry of the current pt                */
    for(i4_pt = 0; i4_pt < 9; i4_pt++)
    {
        i4_num_active += ((i4_active_mask & (1 << i4_pt)) != 0);
    }

    for(i4_pt = 0; i4_pt < (S32)NUM_GRID_PTS; i4_pt++)
    {
        S32 i4_sad, i4_mv_cost, i4_tot_cost;

        if(0 == (i4_active_mask & (1 << i4_pt)))
            continue;

        /* Centre mv displaced by the grid offset of this pt times the step */
        s_cand.s_mv.i2_mvx = ps_centre->s_mv.i2_mvx;
        s_cand.s_mv.i2_mvx += (S16)(i4_pel_step * gai1_grid_id_to_x[i4_pt]);
        s_cand.s_mv.i2_mvy = ps_centre->s_mv.i2_mvy;
        s_cand.s_mv.i2_mvy += (S16)(i4_pel_step * gai1_grid_id_to_y[i4_pt]);

        i4_mv_cost = compute_mv_cost_coarse_high_speed(
            &s_cand, &ps_results->as_pred_ctxt[i4_pred_dir], (PART_ID_T)i4_part, MV_RES_FPEL);

        i4_sad = pi4_sads[i4_num_active * i4_part + i4_active_idx];
        i4_tot_cost = i4_sad + i4_mv_cost;

        ASSERT(i4_first_part == i4_part);
        ASSERT(i4_num_best == 1);

        /* Replace the stored result only on a strict improvement */
        if(i4_tot_cost < ps_best[i4_num_best - 1].i4_tot_cost)
        {
            i4_winner = i4_pt;
            ps_result_prms->i4_min_cost = i4_tot_cost;

            ps_best[0] = s_cand;
            ps_best[0].i4_sad = i4_sad;
            ps_best[0].i4_mv_cost = i4_mv_cost;
            ps_best[0].i4_tot_cost = i4_tot_cost;
        }

        /* SAD entries exist only for enabled pts, hence a separate index */
        i4_active_idx++;
    }

    ps_result_prms->i4_min_id = i4_winner;
}
1555
1556
/**
********************************************************************************
*  @fn     hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
*
*  @brief  For every grid pt enabled in i4_grid_mask and every part id listed
*          in pi4_valid_part_ids (list ends at the first negative entry),
*          adds the mv cost from pf_mv_cost_compute to the SAD of that pt and
*          part, and inserts the candidate into the per part best list when
*          its total cost is lower than the last entry of that list.
*
*  @param[in,out]  ps_result_prms : inputs are read from here; i4_min_cost and
*                  i4_min_id are updated for the first valid part id only
*
*  @return   None. The best lists in ps_search_results are updated in place.
********************************************************************************
*/
void hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
{
    search_results_t *ps_results = ps_result_prms->ps_search_results;
    const search_node_t *ps_centre = ps_result_prms->ps_search_node_base;
    S32 *pi4_part_ids = ps_result_prms->pi4_valid_part_ids;
    S32 *pi4_sads = ps_result_prms->pi4_sad_grid;
    S32 i4_active_mask = ps_result_prms->i4_grid_mask;
    S32 i4_pel_step = ps_result_prms->i4_step;
    S32 i4_num_best = (S32)ps_results->u1_num_results_per_part;
    S32 i4_ref = (S32)ps_result_prms->i1_ref_idx;
    S32 i4_pred_dir = 1 - ps_results->pu1_is_past[i4_ref];
    /* Only this part id drives i4_min_cost / i4_min_id */
    S32 i4_first_part = pi4_part_ids[0];
    /* Working copy of the centre node; only its mv changes per grid pt */
    search_node_t s_cand = *ps_centre;
    S32 i4_num_active = 0;
    S32 i4_active_idx = 0;
    S32 i4_winner = (S32)PT_C;
    S32 i4_pt;

    /* Number of enabled pts; the SAD of part p is read at an offset of */
    /* i4_num_active * p from the entry of the current pt                */
    for(i4_pt = 0; i4_pt < 9; i4_pt++)
    {
        i4_num_active += ((i4_active_mask & (1 << i4_pt)) != 0);
    }

    for(i4_pt = 0; i4_pt < (S32)NUM_GRID_PTS; i4_pt++)
    {
        S32 i4_part_pos, i4_part;

        if(0 == (i4_active_mask & (1 << i4_pt)))
            continue;

        /* Centre mv displaced by the grid offset of this pt times the step */
        s_cand.s_mv.i2_mvx = ps_centre->s_mv.i2_mvx;
        s_cand.s_mv.i2_mvx += (S16)(i4_pel_step * gai1_grid_id_to_x[i4_pt]);
        s_cand.s_mv.i2_mvy = ps_centre->s_mv.i2_mvy;
        s_cand.s_mv.i2_mvy += (S16)(i4_pel_step * gai1_grid_id_to_y[i4_pt]);

        for(i4_part_pos = 0; (i4_part = pi4_part_ids[i4_part_pos]) >= 0; i4_part_pos++)
        {
            search_node_t *ps_best = ps_results->aps_part_results[i4_ref][i4_part];
            S32 i4_mv_cost, i4_sad, i4_tot_cost, i4_slot;

            i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
                &s_cand, &ps_results->as_pred_ctxt[i4_pred_dir], (PART_ID_T)i4_part, MV_RES_FPEL);
            i4_sad = pi4_sads[i4_num_active * i4_part + i4_active_idx];
            i4_tot_cost = i4_sad + i4_mv_cost;

            if((i4_first_part == i4_part) && (i4_tot_cost < ps_result_prms->i4_min_cost))
            {
                i4_winner = i4_pt;
                ps_result_prms->i4_min_cost = i4_tot_cost;
            }

            /* Not better than the last stored result: nothing to insert */
            if(i4_tot_cost >= ps_best[i4_num_best - 1].i4_tot_cost)
                continue;

            /* Locate the slot for the candidate. A strictly costlier entry  */
            /* gets pushed down by one. An equal cost entry for which        */
            /* hme_cmp_nodes returns 0 is overwritten in place.              */
            i4_slot = 0;
            while(i4_slot < i4_num_best - 1)
            {
                S32 i4_slot_cost = ps_best[i4_slot].i4_tot_cost;

                if(i4_tot_cost < i4_slot_cost)
                {
                    memmove(
                        ps_best + i4_slot + 1,
                        ps_best + i4_slot,
                        sizeof(search_node_t) * (i4_num_best - 1 - i4_slot));
                    break;
                }
                if((i4_tot_cost == i4_slot_cost) &&
                   (0 == hme_cmp_nodes(&s_cand, ps_best + i4_slot)))
                {
                    break;
                }
                i4_slot++;
            }

            ps_best[i4_slot] = s_cand;
            ps_best[i4_slot].i4_sad = i4_sad;
            ps_best[i4_slot].i4_mv_cost = i4_mv_cost;
            ps_best[i4_slot].i4_tot_cost = i4_tot_cost;
        }

        /* SAD entries exist only for enabled pts, hence a separate index */
        i4_active_idx++;
    }

    ps_result_prms->i4_min_id = i4_winner;
}
1670
1671
/**
1672
********************************************************************************
1673
*  @fn     hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1674
*
1675
*  @brief  Updates results for the case where 1 best result is to be updated
1676
*          for a given pt, for several parts
1677
*          Note : The function is replicated for CLIPing the cost to 16bit to make
1678
*                  bit match with SIMD version
1679
*
1680
*  @param[in]  result_upd_prms_t : Contains the input parameters to this fxn
1681
*
1682
*  @return   The result_upd_prms_t structure is updated for all the active
1683
*            parts in case the current candt has results for any given part
1684
*             that is the best result for that part
1685
********************************************************************************
1686
*/
1687
void hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1688
3.60M
{
1689
3.60M
    search_node_t s_search_node_grid;
1690
3.60M
    const search_node_t *ps_search_node_base;
1691
3.60M
    search_node_t *ps_search_node_grid, *ps_best_node;
1692
3.60M
    S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
1693
3.60M
    S32 num_results, i4_unique_id = -1, i4_grid_pt;
1694
3.60M
    search_results_t *ps_search_results;
1695
3.60M
    S32 *pi4_valid_part_ids;
1696
3.60M
    S32 i4_step = ps_result_prms->i4_step;
1697
3.60M
    S32 i4_grid_mask, i4_count, i, i4_min_id;
1698
3.60M
    S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
1699
3.60M
    S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
1700
3.60M
    S32 grid_count = 0;
1701
3.60M
    S32 pred_lx;
1702
1703
3.60M
    i4_min_id = (S32)PT_C;
1704
3.60M
    i4_min_cost = MAX_32BIT_VAL;
1705
3.60M
    ps_search_node_grid = &s_search_node_grid;
1706
3.60M
    ps_search_node_base = ps_result_prms->ps_search_node_base;
1707
3.60M
    *ps_search_node_grid = *ps_search_node_base;
1708
3.60M
    pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1709
3.60M
    ps_search_results = ps_result_prms->ps_search_results;
1710
3.60M
    num_results = (S32)ps_search_results->u1_num_results_per_part;
1711
3.60M
    i4_grid_mask = ps_result_prms->i4_grid_mask;
1712
1713
36.0M
    for(i = 0; i < 9; i++)
1714
32.4M
    {
1715
32.4M
        if(i4_grid_mask & (1 << i))
1716
3.60M
            grid_count++;
1717
32.4M
    }
1718
1719
    /* Some basic assumptions: only single pt, only part updates */
1720
    /* and more than 1 best result to be computed.               */
1721
1722
3.60M
    i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
1723
3.60M
    pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
1724
1725
    /*************************************************************************/
1726
    /* Supposing we do hte result update for a unique partid, we can */
1727
    /* store the best pt id in the grid and also min cost is return */
1728
    /* param. This will be useful for early exit cases.             */
1729
    /* TODO : once we have separate fxn for unique part+grid, we can */
1730
    /* do away with this code here                                   */
1731
    /*************************************************************************/
1732
    //if (pi4_valid_part_ids[1] == -1)
1733
3.60M
    i4_unique_id = pi4_valid_part_ids[0];
1734
1735
    /*************************************************************************/
1736
    /* Outer loop runs through all active pts in the grid                    */
1737
    /*************************************************************************/
1738
36.0M
    for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
1739
32.4M
    {
1740
32.4M
        if(!(i4_grid_mask & (1 << i4_grid_pt)))
1741
28.8M
            continue;
1742
1743
        /* For the pt in the grid, update mvx and y depending on */
1744
        /* location of pt. Updates are in FPEL units.            */
1745
3.60M
        ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
1746
3.60M
        ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
1747
3.60M
        ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
1748
3.60M
        ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
1749
1750
3.60M
        i4_count = 0;
1751
1752
        /* pi4_valid_part_ids contains all the valid ids. We loop through */
1753
        /* this till we encounter -1. This is easier than having to       */
1754
        /* figure out part by part, besides, active part decision is      */
1755
        /* usually fixed for a given duration of search, e.g. entire fpel */
1756
        /* refinement for a blk/cu will use fixed valid part mask         */
1757
1758
13.6M
        while((id = pi4_valid_part_ids[i4_count]) >= 0)
1759
10.0M
        {
1760
            //ps_search_node_grid->e_part_type = (PART_TYPE_T)id;
1761
1762
            /*****************************************************************/
1763
            /* points to the best search results corresponding to this       */
1764
            /* specific part type.                                           */
1765
            /*****************************************************************/
1766
10.0M
            ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1767
1768
            /* evaluate mv cost and totalcost for this part for this given mv*/
1769
10.0M
            i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
1770
10.0M
                ps_search_node_grid,
1771
10.0M
                &ps_search_results->as_pred_ctxt[pred_lx],
1772
10.0M
                (PART_ID_T)id,
1773
10.0M
                MV_RES_FPEL);
1774
1775
10.0M
            i4_sad = pi4_sad_grid[grid_count * id];
1776
1777
            /* Clipping to 16 bit to bit match with SIMD version */
1778
10.0M
            i4_mv_cost = CLIP_S16(i4_mv_cost);
1779
10.0M
            i4_sad = CLIP_S16(i4_sad);
1780
1781
10.0M
            i4_tot_cost = i4_sad + i4_mv_cost;
1782
            /* Clipping to 16 bit to bit match with SIMD version */
1783
10.0M
            i4_tot_cost = CLIP_S16(i4_tot_cost);
1784
1785
10.0M
            if(i4_unique_id == id)
1786
3.60M
            {
1787
3.60M
                if(i4_tot_cost < ps_result_prms->i4_min_cost)
1788
3.60M
                {
1789
3.60M
                    i4_min_id = i4_grid_pt;
1790
3.60M
                    ps_result_prms->i4_min_cost = i4_tot_cost;
1791
3.60M
                }
1792
3.60M
            }
1793
1794
            /*****************************************************************/
1795
            /* We do not labor through the results if the total cost worse   */
1796
            /* than the last of the results.                                 */
1797
            /*****************************************************************/
1798
10.0M
            if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1799
2.11M
            {
1800
                /*************************************************************/
1801
                /* Identify where the current result isto be placed.Basically*/
1802
                /* find the node which has cost just higher thannodeundertest*/
1803
                /*************************************************************/
1804
2.59M
                for(i = 0; i < num_results - 1; i++)
1805
1.46M
                {
1806
1.46M
                    if(i4_tot_cost <= ps_best_node[i].i4_tot_cost)
1807
985k
                    {
1808
985k
                        memmove(
1809
985k
                            ps_best_node + i + 1,
1810
985k
                            ps_best_node + i,
1811
985k
                            sizeof(search_node_t) * (num_results - 1 - i));
1812
985k
                        break;
1813
985k
                    }
1814
1.46M
                }
1815
2.11M
                ps_best_node[i] = *ps_search_node_grid;
1816
2.11M
                ps_best_node[i].i4_sad = i4_sad;
1817
2.11M
                ps_best_node[i].i4_mv_cost = i4_mv_cost;
1818
2.11M
                ps_best_node[i].i4_tot_cost = i4_tot_cost;
1819
2.11M
            }
1820
10.0M
            i4_count++;
1821
10.0M
        }
1822
3.60M
        pi4_sad_grid++;
1823
3.60M
    }
1824
3.60M
    ps_result_prms->i4_min_id = i4_min_id;
1825
3.60M
}
1826
1827
/**
1828
********************************************************************************
1829
*  @fn     hme_update_results_pt_npu_best1(result_upd_prms_t *ps_result_prms)
1830
*
1831
*  @brief  Updates results for the case where 1 best result is to be updated
1832
*          for a given pt, for several parts
1833
*
1834
*  @param[in]  ps_result_prms. Contains the input parameters to this fxn
1835
*              ::ps_pred_info : contains cost fxn ptr and predictor info
1836
*              ::pi4_sad : 17x9 SAD Grid, this case, only 1st 17 entries valid
1837
*              ::ps_search_results: Search results structure
1838
*              ::i1_ref_id : Reference index
1839
*              ::i4_grid_mask: Dont Care for this fxn
1840
*              ::pi4_valid_part_ids : valid part ids
1841
*              ::ps_search_node_base: Contains the centre pt candt info.
1842
*
1843
*  @return   The ps_search_results structure is updated for all the active
1844
*            parts in case the current candt has results for any given part
1845
*             that is the best result for that part
1846
********************************************************************************
1847
*/
1848
1849
void hme_update_results_pt_pu_best1_subpel_hs(
1850
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1851
40.0k
{
1852
40.0k
    search_node_t *ps_search_node_base, *ps_best_node;
1853
40.0k
    search_results_t *ps_search_results;
1854
40.0k
    S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1855
40.0k
    S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1856
40.0k
    S32 num_results, i;
1857
40.0k
    S32 *pi4_valid_part_ids;
1858
1859
40.0k
    pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1860
    /* Some basic assumptions: only single pt, only part updates */
1861
    /* and more than 1 best result to be computed.               */
1862
40.0k
    ASSERT(ps_result_prms->i4_grid_mask == 1);
1863
1864
40.0k
    ps_search_results = ps_result_prms->ps_search_results;
1865
40.0k
    num_results = (S32)ps_search_results->u1_num_results_per_part;
1866
1867
    /* Compute mv cost, total cost */
1868
40.0k
    ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1869
1870
390k
    while((id = pi4_valid_part_ids[i4_count]) >= 0)
1871
350k
    {
1872
350k
        S32 update_required = 1;
1873
1874
350k
        ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1875
        /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1876
350k
        i4_mv_cost = ps_best_node->i4_mv_cost;
1877
350k
        i4_sad = ps_result_prms->pi4_sad_grid[id];
1878
350k
        i4_tot_cost = i4_sad + i4_mv_cost;
1879
1880
        /* We do not labor through the results if the total cost is worse than   */
1881
        /* the last of the results.                                              */
1882
350k
        if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1883
188k
        {
1884
            /* Identify where the current result is to be placed. Basically find  */
1885
            /* the node which has cost just higher than node under test           */
1886
188k
            for(i = 0; i < num_results - 1; i++)
1887
0
            {
1888
0
                if(ps_best_node[i].i1_ref_idx != -1)
1889
0
                {
1890
0
                    if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
1891
0
                    {
1892
0
                        memmove(
1893
0
                            ps_best_node + i + 1,
1894
0
                            ps_best_node + i,
1895
0
                            sizeof(search_node_t) * (num_results - 1 - i));
1896
0
                        break;
1897
0
                    }
1898
0
                    else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
1899
0
                    {
1900
0
                        update_required = 0;
1901
0
                        break;
1902
0
                    }
1903
0
                }
1904
0
                else
1905
0
                {
1906
0
                    break;
1907
0
                }
1908
0
            }
1909
1910
188k
            if(update_required)
1911
188k
            {
1912
                /* Update when either ref_idx or mv's are different */
1913
188k
                ps_best_node[i] = *ps_search_node_base;
1914
188k
                ps_best_node[i].i4_sad = i4_sad;
1915
188k
                ps_best_node[i].i4_mv_cost = i4_mv_cost;
1916
188k
                ps_best_node[i].i4_tot_cost = i4_tot_cost;
1917
188k
            }
1918
188k
        }
1919
350k
        i4_count++;
1920
350k
    }
1921
40.0k
}
1922
1923
void hme_update_results_pt_pu_best1_subpel_hs_1(
1924
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1925
0
{
1926
0
    search_node_t *ps_search_node_base, *ps_best_node;
1927
0
    search_results_t *ps_search_results;
1928
0
    S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1929
0
    S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1930
0
    S32 num_results;
1931
0
    S32 *pi4_valid_part_ids;
1932
1933
0
    pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1934
    /* Some basic assumptions: only single pt, only part updates */
1935
    /* and more than 1 best result to be computed.               */
1936
0
    ASSERT(ps_result_prms->i4_grid_mask == 1);
1937
1938
0
    ps_search_results = ps_result_prms->ps_search_results;
1939
0
    num_results = (S32)ps_search_results->u1_num_results_per_part;
1940
1941
    /* Compute mv cost, total cost */
1942
0
    ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1943
1944
0
    while((id = pi4_valid_part_ids[i4_count]) >= 0)
1945
0
    {
1946
0
        S32 update_required = 0;
1947
1948
0
        ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1949
        /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1950
0
        i4_mv_cost = ps_best_node->i4_mv_cost;
1951
0
        i4_sad = ps_result_prms->pi4_sad_grid[id];
1952
0
        i4_tot_cost = i4_sad + i4_mv_cost;
1953
1954
        /* We do not labor through the results if the total cost is worse than   */
1955
        /* the last of the results.                                              */
1956
0
        if(i4_tot_cost < ps_best_node[1].i4_tot_cost)
1957
0
        {
1958
0
            S32 sdi_value = 0;
1959
1960
0
            update_required = 2;
1961
            /* Identify where the current result is to be placed. Basically find  */
1962
            /* the node which has cost just higher than node under test           */
1963
0
            {
1964
0
                if(i4_tot_cost < ps_best_node[0].i4_tot_cost)
1965
0
                {
1966
0
                    update_required = 1;
1967
0
                    sdi_value = ps_best_node[0].i4_sad - i4_sad;
1968
0
                }
1969
0
                else if(
1970
0
                    (ps_result_prms->i2_mv_x == ps_best_node[0].s_mv.i2_mvx) &&
1971
0
                    (ps_result_prms->i2_mv_y == ps_best_node[0].s_mv.i2_mvy) &&
1972
0
                    (ps_best_node[0].i1_ref_idx == ps_result_prms->i1_ref_idx))
1973
0
                {
1974
0
                    update_required = 0;
1975
0
                }
1976
0
            }
1977
0
            if(update_required == 2)
1978
0
            {
1979
0
                subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1980
1981
0
                ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] = i4_tot_cost;
1982
0
                ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] = i4_mv_cost;
1983
0
                ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] = ps_result_prms->i2_mv_x;
1984
0
                ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] = ps_result_prms->i2_mv_y;
1985
0
                ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] = ps_result_prms->i1_ref_idx;
1986
0
            }
1987
0
            else if(update_required == 1)
1988
0
            {
1989
0
                subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1990
1991
0
                ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] =
1992
0
                    ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count];
1993
0
                ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] =
1994
0
                    ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count];
1995
0
                ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] =
1996
0
                    ps_subpel_refine_ctxt->i2_mv_x[0][i4_count];
1997
0
                ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] =
1998
0
                    ps_subpel_refine_ctxt->i2_mv_y[0][i4_count];
1999
0
                ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] =
2000
0
                    ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count];
2001
2002
0
                ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] = i4_tot_cost;
2003
0
                ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count] = i4_mv_cost;
2004
0
                ps_subpel_refine_ctxt->i2_mv_x[0][i4_count] = ps_result_prms->i2_mv_x;
2005
0
                ps_subpel_refine_ctxt->i2_mv_y[0][i4_count] = ps_result_prms->i2_mv_y;
2006
0
                ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count] = ps_result_prms->i1_ref_idx;
2007
0
            }
2008
0
        }
2009
0
        i4_count++;
2010
0
    }
2011
0
}
2012
2013
/**
2014
******************************************************************************
2015
*  @brief Gives a result fxn ptr for a index [x] where x is as:
2016
*         0 : single pt, no partial updates, 1 best result
2017
*         1 : single pt, no partial updates, N best results
2018
*         2 : single pt,    partial updates, 1 best result
2019
*         3 : single pt,    partial updates, N best results
2020
*         0 : grid     , no partial updates, 1 best result
2021
*         1 : grid     , no partial updates, N best results
2022
*         2 : grid     ,    partial updates, 1 best result
2023
*         3 : grid     ,    partial updates, N best results
2024
******************************************************************************
2025
*/
2026
2027
static PF_RESULT_FXN_T g_pf_result_fxn[8] = { UPD_RES_PT_NPU_BEST1,   UPD_RES_PT_NPU_BESTN,
2028
                                              UPD_RES_PT_PU_BEST1,    UPD_RES_PT_PU_BESTN,
2029
                                              UPD_RES_GRID_NPU_BEST1, UPD_RES_GRID_NPU_BESTN,
2030
                                              UPD_RES_GRID_PU_BEST1,  UPD_RES_GRID_PU_BESTN };
2031
2032
/**
2033
********************************************************************************
2034
*  @fn     hme_get_result_fxn(i4_grid_mask, i4_part_mask, i4_num_results)
2035
*
2036
*  @brief  Obtains the suitable result function that evaluates COST and also
2037
*           computes one or more best results for point/grid, single part or
2038
*           more than one part.
2039
*
2040
*  @param[in]  i4_grid_mask : Mask containing which of 9 grid pts active
2041
*
2042
*  @param[in]  i4_part_mask : Mask containing which of the 17 parts active
2043
*
2044
*  @param[in]  i4_num_results: Number of active results
2045
*
2046
*  @return   Pointer to the appropriate result update function
2047
********************************************************************************
2048
*/
2049
PF_RESULT_FXN_T hme_get_result_fxn(S32 i4_grid_mask, S32 i4_part_mask, S32 i4_num_results)
2050
788k
{
2051
788k
    S32 i4_is_grid = (i4_grid_mask != 1);
2052
788k
    S32 i4_is_pu = ((i4_part_mask & (i4_part_mask - 1)) != 0);
2053
788k
    S32 i4_res_gt1 = (i4_num_results > 1);
2054
788k
    S32 id;
2055
2056
788k
    id = (i4_is_grid << 2) + (i4_is_pu << 1) + i4_res_gt1;
2057
2058
788k
    return (g_pf_result_fxn[id]);
2059
788k
}
2060
2061
void hme_calc_sad_and_2_best_results(
2062
    hme_search_prms_t *ps_search_prms,
2063
    wgt_pred_ctxt_t *ps_wt_inp_prms,
2064
    err_prms_t *ps_err_prms,
2065
    result_upd_prms_t *ps_result_prms,
2066
    U08 **ppu1_ref,
2067
    S32 i4_ref_stride)
2068
0
{
2069
0
    S32 i4_candt;
2070
0
    S32 i4_inp_off;
2071
0
    S32 i4_ref_offset;
2072
0
    S32 i4_num_nodes;
2073
2074
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2075
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2076
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2077
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2078
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2079
2080
0
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
2081
0
    search_node_t *ps_search_node;
2082
2083
0
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2084
0
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2085
0
    i4_inp_off = ps_search_prms->i4_cu_x_off;
2086
0
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2087
0
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2088
0
    ps_search_node = ps_search_prms->ps_search_nodes;
2089
2090
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2091
0
    {
2092
        /**********************************************************************/
2093
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
2094
        /**********************************************************************/
2095
0
        {
2096
0
            WORD32 b, c, d;
2097
0
            UWORD8 *pu1_cur_ptr;
2098
0
            UWORD8 *pu1_ref_ptr;
2099
0
            UWORD16 au2_4x4_sad[NUM_4X4];
2100
2101
0
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2102
0
            {
2103
0
                continue;
2104
0
            }
2105
2106
0
            ps_err_prms->pu1_inp =
2107
0
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2108
0
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2109
0
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2110
0
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2111
2112
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2113
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2114
2115
            /* Loop to compute the SAD's */
2116
0
            {
2117
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2118
0
                for(b = 0; b < NUM_4X4; b++)
2119
0
                {
2120
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2121
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2122
2123
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2124
0
                    {
2125
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2126
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2127
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2128
0
                        {
2129
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
2130
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2131
0
                        }
2132
0
                    }
2133
0
                }
2134
2135
0
                pi4_sad_grid[PART_ID_NxN_TL] =
2136
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2137
0
                pi4_sad_grid[PART_ID_NxN_TR] =
2138
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2139
0
                pi4_sad_grid[PART_ID_NxN_BL] =
2140
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2141
0
                pi4_sad_grid[PART_ID_NxN_BR] =
2142
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2143
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
2144
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2145
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
2146
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2147
0
                pi4_sad_grid[PART_ID_2NxN_T] =
2148
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2149
0
                pi4_sad_grid[PART_ID_2NxN_B] =
2150
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2151
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
2152
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2153
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
2154
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2155
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
2156
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2157
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
2158
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2159
0
                pi4_sad_grid[PART_ID_2Nx2N] =
2160
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2161
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
2162
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2163
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
2164
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2165
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
2166
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2167
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
2168
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2169
0
            }
2170
0
        }
2171
2172
0
        {
2173
0
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2174
0
            S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2175
0
            S32 best_node_cost;
2176
0
            S32 second_best_node_cost;
2177
2178
0
            {
2179
0
                S16 mvdx1, mvdy1;
2180
0
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2181
0
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2182
0
                S32 pred_lx = i4_search_idx;
2183
2184
0
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2185
0
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2186
0
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2187
2188
0
                S32 inp_shift = 2;
2189
0
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2190
0
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2191
0
                S32 lambda = ps_pred_ctxt->lambda;
2192
0
                S32 rnd = 1 << (lambda_q_shift - 1);
2193
0
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2194
0
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2195
0
                S32 ref_bits =
2196
0
                    ps_pred_ctxt
2197
0
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2198
2199
0
                COMPUTE_DIFF_MV(
2200
0
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2201
2202
0
                mvdx1 = ABS(mvdx1);
2203
0
                mvdy1 = ABS(mvdy1);
2204
2205
0
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2206
0
                             (mvdy1 > 0) + ref_bits + 2;
2207
2208
0
                i4_mv_cost *= lambda;
2209
0
                i4_mv_cost += rnd;
2210
0
                i4_mv_cost >>= lambda_q_shift;
2211
2212
0
                i4_mv_cost = CLIP_U16(i4_mv_cost);
2213
0
            }
2214
2215
            /*For each valid partition, update the refine_prm structure to reflect the best and second
2216
            best candidates for that partition*/
2217
2218
0
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2219
0
            {
2220
0
                S32 update_required = 0;
2221
0
                S32 part_id = pi4_valid_part_ids[i4_count];
2222
0
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2223
2224
                /*Calculate total cost*/
2225
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2226
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2227
2228
                /*****************************************************************/
2229
                /* We do not labor through the results if the total cost worse   */
2230
                /* than the last of the results.                                 */
2231
                /*****************************************************************/
2232
0
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
2233
0
                second_best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[1][index]);
2234
2235
0
                if(i4_tot_cost < second_best_node_cost)
2236
0
                {
2237
0
                    update_required = 2;
2238
2239
                    /*************************************************************/
2240
                    /* Identify where the current result isto be placed.Basically*/
2241
                    /* find the node which has cost just higher thannodeundertest*/
2242
                    /*************************************************************/
2243
0
                    if(i4_tot_cost < best_node_cost)
2244
0
                    {
2245
0
                        update_required = 1;
2246
0
                    }
2247
0
                    else if(i4_tot_cost == best_node_cost)
2248
0
                    {
2249
0
                        update_required = 0;
2250
0
                    }
2251
2252
0
                    if(update_required == 2)
2253
0
                    {
2254
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2255
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2256
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2257
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2258
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2259
0
                    }
2260
0
                    else if(update_required == 1)
2261
0
                    {
2262
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2263
0
                            ps_mv_refine_ctxt->i2_tot_cost[0][index];
2264
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2265
0
                            ps_mv_refine_ctxt->i2_mv_cost[0][index];
2266
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2267
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2268
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2269
0
                            ps_mv_refine_ctxt->i2_ref_idx[0][index];
2270
2271
0
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2272
0
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2273
0
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2274
0
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2275
0
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2276
0
                    }
2277
0
                }
2278
0
            }
2279
0
        }
2280
0
        ps_search_node++;
2281
0
    }
2282
2283
0
    {
2284
0
        WORD32 i4_i;
2285
0
        WORD32 part_id;
2286
0
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2287
0
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2288
0
        {
2289
0
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2290
0
            if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2291
0
            {
2292
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2293
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2294
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2295
2296
0
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2297
0
            }
2298
0
            if(ps_mv_refine_ctxt->i2_tot_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2299
0
            {
2300
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2301
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2302
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2303
2304
0
                ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2305
0
            }
2306
0
        }
2307
0
    }
2308
0
}
2309
2310
void hme_calc_sad_and_2_best_results_subpel(
2311
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
2312
0
{
2313
0
    S32 i4_candt;
2314
0
    S32 i4_num_nodes;
2315
2316
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2317
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2318
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2319
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2320
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2321
2322
0
    mv_refine_ctxt_t *ps_subpel_refine_ctxt;
2323
0
    ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
2324
0
    i4_num_nodes = 1;
2325
2326
    /* Run through each of the candts in a loop */
2327
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2328
0
    {
2329
        /**********************************************************************/
2330
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
2331
        /**********************************************************************/
2332
0
        {
2333
0
            WORD32 b, c, d;
2334
0
            UWORD8 *pu1_cur_ptr;
2335
0
            UWORD8 *pu1_ref_ptr;
2336
0
            UWORD16 au2_4x4_sad[NUM_4X4];
2337
2338
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2339
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2340
2341
            /* Loop to compute the SAD's */
2342
0
            {
2343
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2344
0
                for(b = 0; b < NUM_4X4; b++)
2345
0
                {
2346
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2347
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2348
2349
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2350
0
                    {
2351
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2352
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2353
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2354
0
                        {
2355
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
2356
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2357
0
                        }
2358
0
                    }
2359
0
                }
2360
2361
0
                pi4_sad_grid[PART_ID_NxN_TL] =
2362
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2363
0
                pi4_sad_grid[PART_ID_NxN_TR] =
2364
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2365
0
                pi4_sad_grid[PART_ID_NxN_BL] =
2366
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2367
0
                pi4_sad_grid[PART_ID_NxN_BR] =
2368
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2369
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
2370
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2371
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
2372
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2373
0
                pi4_sad_grid[PART_ID_2NxN_T] =
2374
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2375
0
                pi4_sad_grid[PART_ID_2NxN_B] =
2376
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2377
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
2378
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2379
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
2380
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2381
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
2382
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2383
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
2384
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2385
0
                pi4_sad_grid[PART_ID_2Nx2N] =
2386
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2387
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
2388
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2389
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
2390
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2391
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
2392
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2393
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
2394
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2395
0
            }
2396
0
        }
2397
        /**********************************************************************/
2398
        /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
2399
        /**********************************************************************/
2400
0
        {
2401
0
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2402
0
            S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
2403
0
            S32 best_node_cost;
2404
0
            S32 second_best_node_cost;
2405
2406
            /*For each valid partition, update the refine_prm structure to reflect the best and second
2407
            best candidates for that partition*/
2408
2409
0
            for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
2410
0
            {
2411
0
                S32 update_required = 0;
2412
0
                S32 part_id = pi4_valid_part_ids[i4_count];
2413
0
                S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2414
2415
                /* Use a pre-computed cost instead of freshly evaluating subpel cost */
2416
0
                i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
2417
2418
                /*Calculate total cost*/
2419
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2420
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2421
2422
                /*****************************************************************/
2423
                /* We do not labor through the results if the total cost worse   */
2424
                /* than the last of the results.                                 */
2425
                /*****************************************************************/
2426
0
                best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
2427
0
                second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
2428
2429
0
                if(i4_tot_cost < second_best_node_cost)
2430
0
                {
2431
0
                    update_required = 2;
2432
2433
                    /*************************************************************/
2434
                    /* Identify where the current result isto be placed.Basically*/
2435
                    /* find the node which has cost just higher thannodeundertest*/
2436
                    /*************************************************************/
2437
0
                    if(i4_tot_cost < best_node_cost)
2438
0
                    {
2439
0
                        update_required = 1;
2440
0
                    }
2441
0
                    else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
2442
0
                    {
2443
0
                        update_required = 0;
2444
0
                    }
2445
0
                    if(update_required == 2)
2446
0
                    {
2447
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2448
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2449
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
2450
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
2451
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
2452
0
                    }
2453
0
                    else if(update_required == 1)
2454
0
                    {
2455
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
2456
0
                            ps_subpel_refine_ctxt->i2_tot_cost[0][index];
2457
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
2458
0
                            ps_subpel_refine_ctxt->i2_mv_cost[0][index];
2459
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index] =
2460
0
                            ps_subpel_refine_ctxt->i2_mv_x[0][index];
2461
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index] =
2462
0
                            ps_subpel_refine_ctxt->i2_mv_y[0][index];
2463
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
2464
0
                            ps_subpel_refine_ctxt->i2_ref_idx[0][index];
2465
2466
0
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2467
0
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2468
0
                        ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
2469
0
                        ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
2470
0
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
2471
0
                    }
2472
0
                }
2473
0
            }
2474
0
        }
2475
0
    }
2476
2477
0
    {
2478
0
        WORD32 i4_count = 0;
2479
0
        for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
2480
0
        {
2481
0
            WORD32 j;
2482
0
            for(j = 0; j < 2; j++)
2483
0
            {
2484
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[j][i4_count] >= MAX_SIGNED_16BIT_VAL)
2485
0
                {
2486
0
                    ps_subpel_refine_ctxt->ai2_fullpel_satd[j][i4_count] = MAX_SIGNED_16BIT_VAL;
2487
0
                }
2488
0
            }
2489
0
        }
2490
0
    }
2491
0
}
2492
2493
void hme_calc_stim_injected_sad_and_2_best_results(
2494
    hme_search_prms_t *ps_search_prms,
2495
    wgt_pred_ctxt_t *ps_wt_inp_prms,
2496
    err_prms_t *ps_err_prms,
2497
    result_upd_prms_t *ps_result_prms,
2498
    U08 **ppu1_ref,
2499
    S32 i4_ref_stride)
2500
0
{
2501
0
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
2502
0
    search_node_t *ps_search_node;
2503
2504
0
    S32 i4_candt;
2505
0
    S32 i4_count;
2506
0
    S32 i4_inp_off;
2507
0
    S32 i4_ref_offset;
2508
0
    S32 i4_num_nodes;
2509
0
    ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
2510
0
        au8_final_ref_sigmaXSquared[17];
2511
0
    UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
2512
0
    S32 *pi4_valid_part_ids;
2513
2514
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2515
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2516
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2517
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2518
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2519
2520
0
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2521
0
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2522
0
    i4_inp_off = ps_search_prms->i4_cu_x_off;
2523
0
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2524
0
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2525
0
    ps_search_node = ps_search_prms->ps_search_nodes;
2526
0
    pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2527
2528
    /* Set local pointer to point to partition level sigma values calculated in hme_refine */
2529
0
    au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
2530
0
    au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
2531
2532
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2533
0
    {
2534
0
        {
2535
0
            WORD32 b, c, d;
2536
0
            UWORD8 *pu1_cur_ptr;
2537
0
            UWORD8 *pu1_ref_ptr;
2538
0
            UWORD16 au2_4x4_sad[NUM_4X4];
2539
2540
0
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2541
0
            {
2542
0
                continue;
2543
0
            }
2544
2545
0
            ps_err_prms->pu1_inp =
2546
0
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2547
0
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2548
0
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2549
0
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2550
2551
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2552
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2553
2554
            /* Loop to compute the SAD's */
2555
0
            {
2556
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2557
0
                for(b = 0; b < NUM_4X4; b++)
2558
0
                {
2559
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2560
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2561
2562
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2563
0
                    {
2564
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2565
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2566
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2567
0
                        {
2568
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
2569
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2570
0
                        }
2571
0
                    }
2572
0
                }
2573
2574
                /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
2575
0
                hme_compute_sigmaX_and_sigmaXSquared(
2576
0
                    pu1_ref_ptr,
2577
0
                    ref_buf_stride,
2578
0
                    au4_4x4_ref_sigmaX,
2579
0
                    au4_4x4_ref_sigmaXSquared,
2580
0
                    4,
2581
0
                    4,
2582
0
                    16,
2583
0
                    16,
2584
0
                    1,
2585
0
                    4);
2586
2587
0
                pi4_sad_grid[PART_ID_NxN_TL] =
2588
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2589
0
                pi4_sad_grid[PART_ID_NxN_TR] =
2590
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2591
0
                pi4_sad_grid[PART_ID_NxN_BL] =
2592
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2593
0
                pi4_sad_grid[PART_ID_NxN_BR] =
2594
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2595
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
2596
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2597
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
2598
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2599
0
                pi4_sad_grid[PART_ID_2NxN_T] =
2600
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2601
0
                pi4_sad_grid[PART_ID_2NxN_B] =
2602
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2603
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
2604
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2605
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
2606
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2607
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
2608
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2609
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
2610
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2611
0
                pi4_sad_grid[PART_ID_2Nx2N] =
2612
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2613
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
2614
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2615
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
2616
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2617
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
2618
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2619
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
2620
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2621
0
            }
2622
0
        }
2623
2624
0
        {
2625
0
            S32 i4_sad, i4_mv_cost, i4_tot_cost;
2626
0
            S32 best_node_cost;
2627
0
            S32 second_best_node_cost;
2628
0
            ULWORD64 u8_temp_var, u8_temp_var1;
2629
0
            ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
2630
2631
0
            {
2632
0
                S16 mvdx1, mvdy1;
2633
0
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2634
0
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2635
0
                S32 pred_lx = i4_search_idx;
2636
2637
0
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2638
0
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2639
0
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2640
2641
0
                S32 inp_shift = 2;
2642
0
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2643
0
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2644
0
                S32 lambda = ps_pred_ctxt->lambda;
2645
0
                S32 rnd = 1 << (lambda_q_shift - 1);
2646
0
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2647
0
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2648
0
                S32 ref_bits =
2649
0
                    ps_pred_ctxt
2650
0
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2651
2652
0
                COMPUTE_DIFF_MV(
2653
0
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2654
2655
0
                mvdx1 = ABS(mvdx1);
2656
0
                mvdy1 = ABS(mvdy1);
2657
2658
0
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2659
0
                             (mvdy1 > 0) + ref_bits + 2;
2660
2661
0
                i4_mv_cost *= lambda;
2662
0
                i4_mv_cost += rnd;
2663
0
                i4_mv_cost >>= lambda_q_shift;
2664
2665
0
                i4_mv_cost = CLIP_U16(i4_mv_cost);
2666
0
            }
2667
2668
0
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2669
0
            {
2670
0
                S32 i4_stim_injected_sad;
2671
0
                S32 i4_stim_injected_cost;
2672
0
                S32 i4_noise_term;
2673
0
                unsigned long u4_shift_val;
2674
0
                S32 i4_bits_req;
2675
2676
0
                S32 update_required = 0;
2677
0
                S32 part_id = pi4_valid_part_ids[i4_count];
2678
0
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2679
2680
0
                WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
2681
2682
0
                S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
2683
2684
0
                if(ps_search_prms->i4_alpha_stim_multiplier)
2685
0
                {
2686
                    /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
2687
0
                    hme_compute_final_sigma_of_pu_from_base_blocks(
2688
0
                        au4_4x4_ref_sigmaX,
2689
0
                        au4_4x4_ref_sigmaXSquared,
2690
0
                        au8_final_ref_sigmaX,
2691
0
                        au8_final_ref_sigmaXSquared,
2692
0
                        16,
2693
0
                        4,
2694
0
                        part_id,
2695
0
                        4);
2696
2697
0
                    u8_ref_X_Square =
2698
0
                        (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
2699
0
                    u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
2700
2701
                    /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
2702
                    /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
2703
                    /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
2704
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
2705
0
                        au8_final_src_sigmaX,
2706
0
                        au8_final_src_sigmaXSquared,
2707
0
                        &u8_src_var,
2708
0
                        i4_inv_wt,
2709
0
                        ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
2710
0
                        ps_wt_inp_prms->wpred_log_wdc,
2711
0
                        part_id);
2712
2713
0
                    u8_ref_var = u8_ref_var >> u4_shift_val;
2714
2715
                    /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
2716
0
                    GETRANGE64(i4_bits_req, u8_ref_var);
2717
2718
0
                    if(i4_bits_req > 27)
2719
0
                    {
2720
0
                        u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
2721
0
                        u8_src_var = u8_src_var >> (i4_bits_req - 27);
2722
0
                    }
2723
2724
0
                    if(u8_src_var == u8_ref_var)
2725
0
                    {
2726
0
                        u8_temp_var = (1 << STIM_Q_FORMAT);
2727
0
                    }
2728
0
                    else
2729
0
                    {
2730
0
                        u8_temp_var = (2 * u8_src_var * u8_ref_var);
2731
0
                        u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
2732
0
                        u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
2733
0
                        u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
2734
0
                        u8_temp_var = (u8_temp_var / u8_temp_var1);
2735
0
                    }
2736
2737
0
                    i4_noise_term = (UWORD32)u8_temp_var;
2738
2739
0
                    ASSERT(i4_noise_term >= 0);
2740
2741
0
                    i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
2742
0
                }
2743
0
                else
2744
0
                {
2745
0
                    i4_noise_term = 0;
2746
0
                }
2747
0
                u8_pure_dist = pi4_sad_grid[part_id];
2748
0
                u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
2749
0
                u8_pure_dist += (1 << ((i4_q_level)-1));
2750
0
                i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
2751
2752
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2753
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2754
0
                i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
2755
0
                i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
2756
2757
0
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
2758
0
                second_best_node_cost =
2759
0
                    CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[1][index]);
2760
2761
0
                if(i4_stim_injected_cost < second_best_node_cost)
2762
0
                {
2763
0
                    update_required = 2;
2764
2765
0
                    if(i4_stim_injected_cost < best_node_cost)
2766
0
                    {
2767
0
                        update_required = 1;
2768
0
                    }
2769
0
                    else if(i4_stim_injected_cost == best_node_cost)
2770
0
                    {
2771
0
                        update_required = 0;
2772
0
                    }
2773
2774
0
                    if(update_required == 2)
2775
0
                    {
2776
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2777
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
2778
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2779
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2780
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2781
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2782
0
                    }
2783
0
                    else if(update_required == 1)
2784
0
                    {
2785
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2786
0
                            ps_mv_refine_ctxt->i2_tot_cost[0][index];
2787
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] =
2788
0
                            ps_mv_refine_ctxt->i2_stim_injected_cost[0][index];
2789
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2790
0
                            ps_mv_refine_ctxt->i2_mv_cost[0][index];
2791
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2792
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2793
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2794
0
                            ps_mv_refine_ctxt->i2_ref_idx[0][index];
2795
2796
0
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2797
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
2798
0
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2799
0
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2800
0
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2801
0
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2802
0
                    }
2803
0
                }
2804
0
            }
2805
0
        }
2806
2807
0
        ps_search_node++;
2808
0
    }
2809
2810
0
    {
2811
0
        WORD32 i4_i;
2812
0
        WORD32 part_id;
2813
0
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2814
0
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2815
0
        {
2816
0
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2817
0
            if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2818
0
            {
2819
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2820
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2821
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2822
2823
0
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2824
0
            }
2825
0
            if(ps_mv_refine_ctxt->i2_stim_injected_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2826
0
            {
2827
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2828
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2829
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2830
2831
0
                ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2832
0
            }
2833
0
        }
2834
0
    }
2835
0
}
2836
2837
void hme_calc_sad_and_1_best_result(
2838
    hme_search_prms_t *ps_search_prms,
2839
    wgt_pred_ctxt_t *ps_wt_inp_prms,
2840
    err_prms_t *ps_err_prms,
2841
    result_upd_prms_t *ps_result_prms,
2842
    U08 **ppu1_ref,
2843
    S32 i4_ref_stride)
2844
279k
{
2845
279k
    S32 i4_candt;
2846
279k
    S32 i4_inp_off;
2847
279k
    S32 i4_ref_offset;
2848
279k
    S32 i4_num_nodes;
2849
2850
279k
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2851
279k
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2852
279k
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2853
279k
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2854
279k
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2855
2856
279k
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
2857
279k
    search_node_t *ps_search_node;
2858
2859
279k
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2860
279k
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2861
279k
    i4_inp_off = ps_search_prms->i4_cu_x_off;
2862
279k
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2863
279k
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2864
279k
    ps_search_node = ps_search_prms->ps_search_nodes;
2865
2866
2.79M
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2867
2.51M
    {
2868
        /**********************************************************************/
2869
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
2870
        /**********************************************************************/
2871
2.51M
        {
2872
2.51M
            WORD32 b, c, d;
2873
2.51M
            UWORD8 *pu1_cur_ptr;
2874
2.51M
            UWORD8 *pu1_ref_ptr;
2875
2.51M
            UWORD16 au2_4x4_sad[NUM_4X4];
2876
2877
2.51M
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2878
0
            {
2879
0
                continue;
2880
0
            }
2881
2882
2.51M
            ps_err_prms->pu1_inp =
2883
2.51M
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2884
2.51M
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2885
2.51M
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2886
2.51M
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2887
2888
2.51M
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2889
2.51M
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2890
2891
            /* Loop to compute the SAD's */
2892
2.51M
            {
2893
2.51M
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2894
42.8M
                for(b = 0; b < NUM_4X4; b++)
2895
40.2M
                {
2896
40.2M
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2897
40.2M
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2898
2899
201M
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2900
161M
                    {
2901
161M
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2902
161M
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2903
805M
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2904
644M
                        {
2905
644M
                            au2_4x4_sad[b] += (UWORD16)ABS((
2906
644M
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2907
644M
                        }
2908
161M
                    }
2909
40.2M
                }
2910
2911
2.51M
                pi4_sad_grid[PART_ID_NxN_TL] =
2912
2.51M
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2913
2.51M
                pi4_sad_grid[PART_ID_NxN_TR] =
2914
2.51M
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2915
2.51M
                pi4_sad_grid[PART_ID_NxN_BL] =
2916
2.51M
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2917
2.51M
                pi4_sad_grid[PART_ID_NxN_BR] =
2918
2.51M
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2919
2.51M
                pi4_sad_grid[PART_ID_Nx2N_L] =
2920
2.51M
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2921
2.51M
                pi4_sad_grid[PART_ID_Nx2N_R] =
2922
2.51M
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2923
2.51M
                pi4_sad_grid[PART_ID_2NxN_T] =
2924
2.51M
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2925
2.51M
                pi4_sad_grid[PART_ID_2NxN_B] =
2926
2.51M
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2927
2.51M
                pi4_sad_grid[PART_ID_nLx2N_L] =
2928
2.51M
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2929
2.51M
                pi4_sad_grid[PART_ID_nRx2N_R] =
2930
2.51M
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2931
2.51M
                pi4_sad_grid[PART_ID_2NxnU_T] =
2932
2.51M
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2933
2.51M
                pi4_sad_grid[PART_ID_2NxnD_B] =
2934
2.51M
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2935
2.51M
                pi4_sad_grid[PART_ID_2Nx2N] =
2936
2.51M
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2937
2.51M
                pi4_sad_grid[PART_ID_2NxnU_B] =
2938
2.51M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2939
2.51M
                pi4_sad_grid[PART_ID_2NxnD_T] =
2940
2.51M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2941
2.51M
                pi4_sad_grid[PART_ID_nRx2N_L] =
2942
2.51M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2943
2.51M
                pi4_sad_grid[PART_ID_nLx2N_R] =
2944
2.51M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2945
2.51M
            }
2946
2.51M
        }
2947
2948
0
        {
2949
2.51M
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2950
2.51M
            S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2951
2.51M
            S32 best_node_cost;
2952
2.51M
            S32 second_best_node_cost;
2953
2954
2.51M
            {
2955
2.51M
                S16 mvdx1, mvdy1;
2956
2.51M
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2957
2.51M
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2958
2.51M
                S32 pred_lx = i4_search_idx;
2959
2960
2.51M
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2961
2.51M
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2962
2.51M
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2963
2964
2.51M
                S32 inp_shift = 2;
2965
2.51M
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2966
2.51M
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2967
2.51M
                S32 lambda = ps_pred_ctxt->lambda;
2968
2.51M
                S32 rnd = 1 << (lambda_q_shift - 1);
2969
2.51M
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2970
2.51M
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2971
2.51M
                S32 ref_bits =
2972
2.51M
                    ps_pred_ctxt
2973
2.51M
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2974
2975
2.51M
                COMPUTE_DIFF_MV(
2976
2.51M
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2977
2978
2.51M
                mvdx1 = ABS(mvdx1);
2979
2.51M
                mvdy1 = ABS(mvdy1);
2980
2981
2.51M
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2982
2.51M
                             (mvdy1 > 0) + ref_bits + 2;
2983
2984
2.51M
                i4_mv_cost *= lambda;
2985
2.51M
                i4_mv_cost += rnd;
2986
2.51M
                i4_mv_cost >>= lambda_q_shift;
2987
2988
2.51M
                i4_mv_cost = CLIP_U16(i4_mv_cost);
2989
2.51M
            }
2990
2991
            /*For each valid partition, update the refine_prm structure to reflect the best and second
2992
            best candidates for that partition*/
2993
2994
27.8M
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2995
25.3M
            {
2996
25.3M
                S32 update_required = 0;
2997
25.3M
                S32 part_id = pi4_valid_part_ids[i4_count];
2998
25.3M
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2999
3000
                /*Calculate total cost*/
3001
25.3M
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3002
25.3M
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3003
3004
                /*****************************************************************/
3005
                /* We do not labor through the results if the total cost worse   */
3006
                /* than the last of the results.                                 */
3007
                /*****************************************************************/
3008
25.3M
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
3009
25.3M
                second_best_node_cost = SHRT_MAX;
3010
3011
25.3M
                if(i4_tot_cost < second_best_node_cost)
3012
25.2M
                {
3013
25.2M
                    update_required = 0;
3014
3015
                    /*************************************************************/
3016
                    /* Identify where the current result isto be placed.Basically*/
3017
                    /* find the node which has cost just higher thannodeundertest*/
3018
                    /*************************************************************/
3019
25.2M
                    if(i4_tot_cost < best_node_cost)
3020
3.41M
                    {
3021
3.41M
                        update_required = 1;
3022
3.41M
                    }
3023
21.8M
                    else if(i4_tot_cost == best_node_cost)
3024
2.09M
                    {
3025
2.09M
                        update_required = 0;
3026
2.09M
                    }
3027
3028
25.2M
                    if(update_required == 2)
3029
0
                    {
3030
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3031
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3032
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3033
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3034
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3035
0
                    }
3036
25.2M
                    else if(update_required == 1)
3037
3.41M
                    {
3038
3.41M
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3039
3.41M
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3040
3.41M
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3041
3.41M
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3042
3.41M
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3043
3.41M
                    }
3044
25.2M
                }
3045
25.3M
            }
3046
2.51M
        }
3047
2.51M
        ps_search_node++;
3048
2.51M
    }
3049
3050
279k
    {
3051
279k
        WORD32 i4_i;
3052
279k
        WORD32 part_id;
3053
279k
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3054
2.64M
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3055
2.37M
        {
3056
2.37M
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3057
2.37M
            if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3058
173k
            {
3059
173k
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3060
173k
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3061
173k
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3062
3063
173k
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3064
173k
            }
3065
2.37M
        }
3066
279k
    }
3067
279k
}
3068
3069
void hme_calc_stim_injected_sad_and_1_best_result(
3070
    hme_search_prms_t *ps_search_prms,
3071
    wgt_pred_ctxt_t *ps_wt_inp_prms,
3072
    err_prms_t *ps_err_prms,
3073
    result_upd_prms_t *ps_result_prms,
3074
    U08 **ppu1_ref,
3075
    S32 i4_ref_stride)
3076
0
{
3077
0
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
3078
0
    search_node_t *ps_search_node;
3079
3080
0
    S32 i4_candt;
3081
0
    S32 i4_count;
3082
0
    S32 i4_inp_off;
3083
0
    S32 i4_ref_offset;
3084
0
    S32 i4_num_nodes;
3085
0
    ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
3086
0
        au8_final_ref_sigmaXSquared[17];
3087
0
    UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
3088
0
    S32 *pi4_valid_part_ids;
3089
3090
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3091
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3092
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3093
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3094
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3095
3096
0
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
3097
0
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3098
0
    i4_inp_off = ps_search_prms->i4_cu_x_off;
3099
0
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
3100
0
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3101
0
    ps_search_node = ps_search_prms->ps_search_nodes;
3102
0
    pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
3103
3104
    /* Set local pointer to point to partition level sigma values calculated in hme_refine */
3105
0
    au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
3106
0
    au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
3107
3108
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3109
0
    {
3110
0
        {
3111
0
            WORD32 b, c, d;
3112
0
            UWORD8 *pu1_cur_ptr;
3113
0
            UWORD8 *pu1_ref_ptr;
3114
0
            UWORD16 au2_4x4_sad[NUM_4X4];
3115
3116
0
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3117
0
            {
3118
0
                continue;
3119
0
            }
3120
3121
0
            ps_err_prms->pu1_inp =
3122
0
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3123
0
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3124
0
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3125
0
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3126
3127
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
3128
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3129
3130
            /* Loop to compute the SAD's */
3131
0
            {
3132
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3133
0
                for(b = 0; b < NUM_4X4; b++)
3134
0
                {
3135
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3136
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3137
3138
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3139
0
                    {
3140
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
3141
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
3142
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3143
0
                        {
3144
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
3145
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3146
0
                        }
3147
0
                    }
3148
0
                }
3149
3150
                /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
3151
0
                hme_compute_sigmaX_and_sigmaXSquared(
3152
0
                    pu1_ref_ptr,
3153
0
                    ref_buf_stride,
3154
0
                    au4_4x4_ref_sigmaX,
3155
0
                    au4_4x4_ref_sigmaXSquared,
3156
0
                    4,
3157
0
                    4,
3158
0
                    16,
3159
0
                    16,
3160
0
                    1,
3161
0
                    4);
3162
3163
0
                pi4_sad_grid[PART_ID_NxN_TL] =
3164
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3165
0
                pi4_sad_grid[PART_ID_NxN_TR] =
3166
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3167
0
                pi4_sad_grid[PART_ID_NxN_BL] =
3168
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3169
0
                pi4_sad_grid[PART_ID_NxN_BR] =
3170
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3171
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
3172
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3173
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
3174
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3175
0
                pi4_sad_grid[PART_ID_2NxN_T] =
3176
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3177
0
                pi4_sad_grid[PART_ID_2NxN_B] =
3178
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3179
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
3180
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3181
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
3182
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3183
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
3184
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3185
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
3186
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3187
0
                pi4_sad_grid[PART_ID_2Nx2N] =
3188
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3189
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
3190
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3191
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
3192
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3193
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
3194
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3195
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
3196
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3197
0
            }
3198
0
        }
3199
3200
0
        {
3201
0
            S32 i4_sad, i4_mv_cost, i4_tot_cost;
3202
0
            S32 best_node_cost;
3203
0
            S32 second_best_node_cost;
3204
0
            ULWORD64 u8_temp_var, u8_temp_var1;
3205
0
            ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
3206
3207
0
            {
3208
0
                S16 mvdx1, mvdy1;
3209
0
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
3210
0
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
3211
0
                S32 pred_lx = i4_search_idx;
3212
3213
0
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
3214
0
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
3215
0
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
3216
3217
0
                S32 inp_shift = 2;
3218
0
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3219
0
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
3220
0
                S32 lambda = ps_pred_ctxt->lambda;
3221
0
                S32 rnd = 1 << (lambda_q_shift - 1);
3222
0
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3223
0
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3224
0
                S32 ref_bits =
3225
0
                    ps_pred_ctxt
3226
0
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
3227
3228
0
                COMPUTE_DIFF_MV(
3229
0
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
3230
3231
0
                mvdx1 = ABS(mvdx1);
3232
0
                mvdy1 = ABS(mvdy1);
3233
3234
0
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
3235
0
                             (mvdy1 > 0) + ref_bits + 2;
3236
3237
0
                i4_mv_cost *= lambda;
3238
0
                i4_mv_cost += rnd;
3239
0
                i4_mv_cost >>= lambda_q_shift;
3240
3241
0
                i4_mv_cost = CLIP_U16(i4_mv_cost);
3242
0
            }
3243
3244
0
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
3245
0
            {
3246
0
                S32 i4_stim_injected_sad;
3247
0
                S32 i4_stim_injected_cost;
3248
0
                S32 i4_noise_term;
3249
0
                unsigned long u4_shift_val;
3250
0
                S32 i4_bits_req;
3251
3252
0
                S32 update_required = 0;
3253
0
                S32 part_id = pi4_valid_part_ids[i4_count];
3254
0
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3255
3256
0
                WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
3257
3258
0
                S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
3259
3260
0
                if(ps_search_prms->i4_alpha_stim_multiplier)
3261
0
                {
3262
                    /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
3263
0
                    hme_compute_final_sigma_of_pu_from_base_blocks(
3264
0
                        au4_4x4_ref_sigmaX,
3265
0
                        au4_4x4_ref_sigmaXSquared,
3266
0
                        au8_final_ref_sigmaX,
3267
0
                        au8_final_ref_sigmaXSquared,
3268
0
                        16,
3269
0
                        4,
3270
0
                        part_id,
3271
0
                        4);
3272
3273
0
                    u8_ref_X_Square =
3274
0
                        (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
3275
0
                    u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
3276
3277
                    /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
3278
                    /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
3279
                    /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
3280
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
3281
0
                        au8_final_src_sigmaX,
3282
0
                        au8_final_src_sigmaXSquared,
3283
0
                        &u8_src_var,
3284
0
                        i4_inv_wt,
3285
0
                        ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
3286
0
                        ps_wt_inp_prms->wpred_log_wdc,
3287
0
                        part_id);
3288
3289
0
                    u8_ref_var = u8_ref_var >> u4_shift_val;
3290
3291
                    /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
3292
0
                    GETRANGE64(i4_bits_req, u8_ref_var);
3293
3294
0
                    if(i4_bits_req > 27)
3295
0
                    {
3296
0
                        u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
3297
0
                        u8_src_var = u8_src_var >> (i4_bits_req - 27);
3298
0
                    }
3299
3300
0
                    if(u8_src_var == u8_ref_var)
3301
0
                    {
3302
0
                        u8_temp_var = (1 << STIM_Q_FORMAT);
3303
0
                    }
3304
0
                    else
3305
0
                    {
3306
0
                        u8_temp_var = (2 * u8_src_var * u8_ref_var);
3307
0
                        u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
3308
0
                        u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
3309
0
                        u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
3310
0
                        u8_temp_var = (u8_temp_var / u8_temp_var1);
3311
0
                    }
3312
3313
0
                    i4_noise_term = (UWORD32)u8_temp_var;
3314
3315
0
                    ASSERT(i4_noise_term >= 0);
3316
3317
0
                    i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
3318
0
                }
3319
0
                else
3320
0
                {
3321
0
                    i4_noise_term = 0;
3322
0
                }
3323
0
                u8_pure_dist = pi4_sad_grid[part_id];
3324
0
                u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
3325
0
                u8_pure_dist += (1 << ((i4_q_level)-1));
3326
0
                i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
3327
3328
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3329
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3330
0
                i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
3331
0
                i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
3332
3333
0
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
3334
0
                second_best_node_cost = SHRT_MAX;
3335
3336
0
                if(i4_stim_injected_cost < second_best_node_cost)
3337
0
                {
3338
0
                    update_required = 0;
3339
3340
0
                    if(i4_stim_injected_cost < best_node_cost)
3341
0
                    {
3342
0
                        update_required = 1;
3343
0
                    }
3344
0
                    else if(i4_stim_injected_cost == best_node_cost)
3345
0
                    {
3346
0
                        update_required = 0;
3347
0
                    }
3348
3349
0
                    if(update_required == 2)
3350
0
                    {
3351
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3352
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
3353
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3354
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3355
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3356
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3357
0
                    }
3358
0
                    else if(update_required == 1)
3359
0
                    {
3360
0
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3361
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
3362
0
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3363
0
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3364
0
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3365
0
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3366
0
                    }
3367
0
                }
3368
0
            }
3369
0
        }
3370
3371
0
        ps_search_node++;
3372
0
    }
3373
3374
0
    {
3375
0
        WORD32 i4_i;
3376
0
        WORD32 part_id;
3377
0
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3378
0
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3379
0
        {
3380
0
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3381
0
            if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3382
0
            {
3383
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3384
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3385
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3386
3387
0
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3388
0
            }
3389
0
        }
3390
0
    }
3391
0
}
3392
3393
void hme_calc_sad_and_1_best_result_subpel(
3394
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
3395
508k
{
3396
508k
    S32 i4_candt;
3397
508k
    S32 i4_num_nodes;
3398
3399
508k
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3400
3401
508k
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3402
508k
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3403
508k
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3404
508k
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3405
3406
508k
    mv_refine_ctxt_t *ps_subpel_refine_ctxt;
3407
508k
    ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
3408
508k
    i4_num_nodes = 1;
3409
3410
    /* Run through each of the candts in a loop */
3411
1.01M
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3412
508k
    {
3413
        /**********************************************************************/
3414
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
3415
        /**********************************************************************/
3416
508k
        {
3417
508k
            WORD32 b, c, d;
3418
508k
            UWORD8 *pu1_cur_ptr;
3419
508k
            UWORD8 *pu1_ref_ptr;
3420
508k
            UWORD16 au2_4x4_sad[NUM_4X4];
3421
3422
508k
            pu1_cur_ptr = ps_err_prms->pu1_inp;
3423
508k
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3424
3425
            /* Loop to compute the SAD's */
3426
508k
            {
3427
508k
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3428
8.64M
                for(b = 0; b < NUM_4X4; b++)
3429
8.13M
                {
3430
8.13M
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3431
8.13M
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3432
3433
40.6M
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3434
32.5M
                    {
3435
32.5M
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
3436
32.5M
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
3437
162M
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3438
130M
                        {
3439
130M
                            au2_4x4_sad[b] += (UWORD16)ABS((
3440
130M
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3441
130M
                        }
3442
32.5M
                    }
3443
8.13M
                }
3444
3445
508k
                pi4_sad_grid[PART_ID_NxN_TL] =
3446
508k
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3447
508k
                pi4_sad_grid[PART_ID_NxN_TR] =
3448
508k
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3449
508k
                pi4_sad_grid[PART_ID_NxN_BL] =
3450
508k
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3451
508k
                pi4_sad_grid[PART_ID_NxN_BR] =
3452
508k
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3453
508k
                pi4_sad_grid[PART_ID_Nx2N_L] =
3454
508k
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3455
508k
                pi4_sad_grid[PART_ID_Nx2N_R] =
3456
508k
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3457
508k
                pi4_sad_grid[PART_ID_2NxN_T] =
3458
508k
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3459
508k
                pi4_sad_grid[PART_ID_2NxN_B] =
3460
508k
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3461
508k
                pi4_sad_grid[PART_ID_nLx2N_L] =
3462
508k
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3463
508k
                pi4_sad_grid[PART_ID_nRx2N_R] =
3464
508k
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3465
508k
                pi4_sad_grid[PART_ID_2NxnU_T] =
3466
508k
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3467
508k
                pi4_sad_grid[PART_ID_2NxnD_B] =
3468
508k
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3469
508k
                pi4_sad_grid[PART_ID_2Nx2N] =
3470
508k
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3471
508k
                pi4_sad_grid[PART_ID_2NxnU_B] =
3472
508k
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3473
508k
                pi4_sad_grid[PART_ID_2NxnD_T] =
3474
508k
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3475
508k
                pi4_sad_grid[PART_ID_nRx2N_L] =
3476
508k
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3477
508k
                pi4_sad_grid[PART_ID_nLx2N_R] =
3478
508k
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3479
508k
            }
3480
508k
        }
3481
        /**********************************************************************/
3482
        /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
3483
        /**********************************************************************/
3484
508k
        {
3485
508k
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
3486
508k
            S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
3487
508k
            S32 best_node_cost;
3488
508k
            S32 second_best_node_cost;
3489
3490
            /*For each valid partition, update the refine_prm structure to reflect the best and second
3491
            best candidates for that partition*/
3492
3493
1.92M
            for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
3494
1.41M
            {
3495
1.41M
                S32 update_required = 0;
3496
1.41M
                S32 part_id = pi4_valid_part_ids[i4_count];
3497
1.41M
                S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3498
3499
                /* Use a pre-computed cost instead of freshly evaluating subpel cost */
3500
1.41M
                i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3501
3502
                /*Calculate total cost*/
3503
1.41M
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3504
1.41M
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3505
3506
                /*****************************************************************/
3507
                /* We do not labor through the results if the total cost worse   */
3508
                /* than the last of the results.                                 */
3509
                /*****************************************************************/
3510
1.41M
                best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
3511
1.41M
                second_best_node_cost = SHRT_MAX;
3512
3513
1.41M
                if(i4_tot_cost < second_best_node_cost)
3514
1.41M
                {
3515
1.41M
                    update_required = 0;
3516
3517
                    /*************************************************************/
3518
                    /* Identify where the current result isto be placed.Basically*/
3519
                    /* find the node which has cost just higher thannodeundertest*/
3520
                    /*************************************************************/
3521
1.41M
                    if(i4_tot_cost < best_node_cost)
3522
67.5k
                    {
3523
67.5k
                        update_required = 1;
3524
67.5k
                    }
3525
1.34M
                    else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
3526
446k
                    {
3527
446k
                        update_required = 0;
3528
446k
                    }
3529
1.41M
                    if(update_required == 2)
3530
0
                    {
3531
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3532
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3533
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
3534
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
3535
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
3536
0
                    }
3537
1.41M
                    else if(update_required == 1)
3538
67.5k
                    {
3539
67.5k
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3540
67.5k
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3541
67.5k
                        ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
3542
67.5k
                        ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
3543
67.5k
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
3544
67.5k
                    }
3545
1.41M
                }
3546
1.41M
            }
3547
508k
        }
3548
508k
    }
3549
3550
508k
    {
3551
508k
        WORD32 i4_count = 0;
3552
9.15M
        for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
3553
8.64M
        {
3554
8.64M
            if(ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] >= MAX_SIGNED_16BIT_VAL)
3555
7.23M
            {
3556
7.23M
                ps_subpel_refine_ctxt->ai2_fullpel_satd[0][i4_count] = MAX_SIGNED_16BIT_VAL;
3557
7.23M
            }
3558
8.64M
        }
3559
508k
    }
3560
508k
}
3561
3562
/**
3563
********************************************************************************
3564
*  @fn     hme_calc_pt_sad_and_result_explicit(hme_search_prms_t *ps_search_prms,
3565
*                                              wgt_pred_ctxt_t *ps_wt_inp_prms,
3566
*                                              err_prms_t *ps_err_prms,
3567
*                                              result_upd_prms_t *ps_result_prms,
3568
*                                              U08 **ppu1_ref,
3569
*                                              S32 i4_ref_stride)
3570
*
3571
*  @brief   Run thorugh the provided candidates and compute the point SAD and
3572
*           cost and update the results in the order
3573
*
3574
*  @param[in]  ps_search_prms
3575
*  @param[in]  ps_wt_inp_prms
3576
*  @param[in]  ps_err_prms
3577
*  @param[out] ps_result_prms
3578
*  @param[in]  ppu1_ref
3579
*  @param[in]  i4_ref_stride
3580
*
3581
*  @return   None
3582
********************************************************************************
3583
*/
3584
3585
void hme_calc_pt_sad_and_result_explicit(
3586
    hme_search_prms_t *ps_search_prms,
3587
    wgt_pred_ctxt_t *ps_wt_inp_prms,
3588
    err_prms_t *ps_err_prms,
3589
    result_upd_prms_t *ps_result_prms,
3590
    U08 **ppu1_ref,
3591
    S32 i4_ref_stride)
3592
403k
{
3593
403k
    WORD32 i4_grid_mask, i4_part_mask, i4_num_results, i4_candt, i4_num_nodes;
3594
403k
    WORD32 i4_inp_stride, i4_inp_off, i4_ref_offset;
3595
3596
403k
    search_node_t *ps_search_node;
3597
403k
    BLK_SIZE_T e_blk_size;
3598
403k
    PF_SAD_FXN_T pf_sad_fxn;
3599
403k
    PF_RESULT_FXN_T pf_hme_result_fxn;
3600
3601
403k
    i4_grid_mask = 0x1; /* Point SAD */
3602
3603
    /* Get the parameters required */
3604
403k
    i4_part_mask = ps_search_prms->i4_part_mask;
3605
403k
    e_blk_size = ps_search_prms->e_blk_size;
3606
403k
    i4_num_results = (S32)ps_search_prms->ps_search_results->u1_num_results_per_part;
3607
403k
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3608
403k
    ps_search_node = ps_search_prms->ps_search_nodes;
3609
3610
403k
    i4_inp_stride = ps_search_prms->i4_inp_stride;
3611
    /* Move to the location of the search blk in inp buffer */
3612
403k
    i4_inp_off = ps_search_prms->i4_cu_x_off;
3613
403k
    i4_inp_off += ps_search_prms->i4_cu_y_off * i4_inp_stride;
3614
403k
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3615
3616
403k
    pf_sad_fxn = hme_get_sad_fxn(e_blk_size, i4_grid_mask, i4_part_mask);
3617
    /**********************************************************************/
3618
    /* we have a sparsely populated SAD grid of size 9x17.                */
3619
    /* the id of the results in the grid is shown                         */
3620
    /*     5   2   6                                                      */
3621
    /*     1   0   3                                                      */
3622
    /*     7   4   8                                                      */
3623
    /* The motivation for choosing a grid like this is that               */
3624
    /* in case of no refinement, the central location is                  */
3625
    /* the first entry in the grid                                        */
3626
    /* Also for diamond, the 4 entries get considered first               */
3627
    /* This is consistent with the diamond notation used in               */
3628
    /* subpel refinement. To Check                                        */
3629
    /* Update the results for the given search candt                      */
3630
    /* returns the cost of the 2Nx2N partition                            */
3631
    /**********************************************************************/
3632
3633
    /* Get the modified update result fun. with CLIP16 of cost to match   */
3634
    /* with SIMD */
3635
403k
    pf_hme_result_fxn = hme_update_results_grid_pu_bestn_no_encode;
3636
3637
4.01M
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3638
3.60M
    {
3639
3.60M
        if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3640
0
            continue;
3641
3642
        /* initialize minimum cost for this candidate. As we search around */
3643
        /* this candidate, this is used to check early exit, when in any   */
3644
        /* given iteration, the center pt of the grid is lowest value      */
3645
3.60M
        ps_result_prms->i4_min_cost = MAX_32BIT_VAL;
3646
3647
3.60M
        ps_err_prms->pu1_inp = ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3648
3.60M
        ps_err_prms->i4_grid_mask = i4_grid_mask;
3649
3650
3.60M
        ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3651
3.60M
        ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3652
3.60M
        ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3653
3654
        /**********************************************************************/
3655
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
3656
        /**********************************************************************/
3657
3.60M
        pf_sad_fxn(ps_err_prms);
3658
3659
        /**********************************************************************/
3660
        /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
3661
        /**********************************************************************/
3662
3.60M
        ps_result_prms->i4_grid_mask = i4_grid_mask;
3663
3.60M
        ps_result_prms->ps_search_node_base = ps_search_node;
3664
3.60M
        pf_hme_result_fxn(ps_result_prms);
3665
3666
3.60M
        ps_search_node++;
3667
3.60M
    }
3668
403k
}
3669
3670
/**
3671
********************************************************************************
3672
*  @fn     hme_set_mvp_node(search_results_t *ps_search_results,
3673
*                           search_node_t *ps_candt_prj_coloc,
3674
*                           S08 i1_ref_idx)
3675
*
3676
*  @brief   Set node used for motion vector predictor computation
3677
*           Either TR or L is compared to projected colocated and
3678
*           closest is decided as MVP
3679
*
3680
*  @param[in]  ps_search_results
3681
*
3682
*  @param[in]  ps_candt_prj_coloc
3683
*
3684
*  @param[in]  i1_ref_idx
3685
*
3686
*  @return   None
3687
********************************************************************************
3688
*/
3689
void hme_set_mvp_node(
3690
    search_results_t *ps_search_results,
3691
    search_node_t *ps_candt_prj_coloc,
3692
    U08 u1_pred_lx,
3693
    U08 u1_default_ref_id)
3694
164k
{
3695
164k
    S32 i;
3696
164k
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[u1_pred_lx];
3697
164k
    pred_candt_nodes_t *ps_pred_nodes = ps_pred_ctxt->as_pred_nodes;
3698
164k
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
3699
3700
164k
    S32 inp_shift = 2;
3701
164k
    S32 pred_shift;
3702
164k
    S32 ref_bits;
3703
164k
    S32 mv_p_x, mv_p_y;
3704
164k
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
3705
3706
164k
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[u1_pred_lx][u1_default_ref_id];
3707
3708
    /*************************************************************************/
3709
    /* Priority to bottom left availability. Else we go to left. If both are */
3710
    /* not available, then a remains null                                    */
3711
    /*************************************************************************/
3712
164k
    if(ps_pred_nodes->ps_l->u1_is_avail)
3713
139k
    {
3714
139k
        ps_pred_node_a = ps_pred_nodes->ps_l;
3715
139k
    }
3716
3717
164k
    if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
3718
70.2k
    {
3719
70.2k
        ps_pred_node_b = ps_pred_nodes->ps_tr;
3720
70.2k
    }
3721
93.9k
    else
3722
93.9k
    {
3723
93.9k
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
3724
93.9k
        ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3725
93.9k
    }
3726
3727
164k
    if(ps_pred_node_a == NULL)
3728
24.2k
    {
3729
24.2k
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
3730
24.2k
        ps_pred_node_a->s_mv = ps_pred_node_a->ps_mv[0];
3731
3732
24.2k
        if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
3733
7.46k
        {
3734
7.46k
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
3735
7.46k
            ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3736
7.46k
        }
3737
24.2k
    }
3738
3739
164k
    if(ps_pred_node_a->i1_ref_idx != u1_default_ref_id)
3740
70.7k
    {
3741
70.7k
        SCALE_FOR_POC_DELTA(
3742
70.7k
            mv_p_x, mv_p_y, ps_pred_node_a, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3743
70.7k
    }
3744
93.4k
    else
3745
93.4k
    {
3746
93.4k
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3747
93.4k
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3748
93.4k
    }
3749
164k
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3750
164k
    COMPUTE_MV_DIFFERENCE(mvdx1, mvdy1, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3751
164k
    mvdx1 = ABS(mvdx1);
3752
164k
    mvdy1 = ABS(mvdy1);
3753
3754
164k
    if(ps_pred_node_b->i1_ref_idx != u1_default_ref_id)
3755
70.6k
    {
3756
70.6k
        SCALE_FOR_POC_DELTA(
3757
70.6k
            mv_p_x, mv_p_y, ps_pred_node_b, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3758
70.6k
    }
3759
93.5k
    else
3760
93.5k
    {
3761
93.5k
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
3762
93.5k
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
3763
93.5k
    }
3764
164k
    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
3765
164k
    COMPUTE_MV_DIFFERENCE(mvdx2, mvdy2, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3766
164k
    mvdx2 = ABS(mvdx2);
3767
164k
    mvdy2 = ABS(mvdy2);
3768
3769
164k
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
3770
43.2k
    {
3771
779k
        for(i = 0; i < TOT_NUM_PARTS; i++)
3772
736k
        {
3773
736k
            ps_pred_nodes[i].ps_mvp_node = ps_pred_node_a;
3774
736k
        }
3775
43.2k
    }
3776
120k
    else
3777
120k
    {
3778
2.17M
        for(i = 0; i < TOT_NUM_PARTS; i++)
3779
2.05M
        {
3780
2.05M
            ps_pred_nodes[i].ps_mvp_node = ps_pred_node_b;
3781
2.05M
        }
3782
120k
    }
3783
164k
}