Coverage Report

Created: 2025-08-03 06:13

/src/libhevc/encoder/hme_err_compute.c
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/*!
22
***************************************************************************
23
* \file hme_err_compute.c
24
*
25
* \brief
26
*    SAD / SATD routines for error computation
27
*
28
* Detailed_description : Contains various types of SAD/SATD routines for
29
*   error computation between a given input and reference ptr. The SAD
30
*   routines can evaluate for either a single point or a grid, and can
31
*   evaluate with either partial updates or no partial updates. Partial
32
*   updates means evaluating sub block SADs, e.g. 4 4x4 subblock SAD in
33
*   addition to the main 8x8 block SAD.
34
*
35
* \date
36
*    22/9/2012
37
*
38
* \author  Ittiam
39
***************************************************************************
40
*/
41
42
/*****************************************************************************/
43
/* File Includes                                                             */
44
/*****************************************************************************/
45
/* System include files */
46
#include <stdio.h>
47
#include <string.h>
48
#include <stdlib.h>
49
#include <assert.h>
50
#include <stdarg.h>
51
#include <math.h>
52
#include <limits.h>
53
54
/* User include files */
55
#include "ihevc_typedefs.h"
56
#include "itt_video_api.h"
57
#include "ihevce_api.h"
58
59
#include "rc_cntrl_param.h"
60
#include "rc_frame_info_collector.h"
61
#include "rc_look_ahead_params.h"
62
63
#include "ihevc_defs.h"
64
#include "ihevc_structs.h"
65
#include "ihevc_platform_macros.h"
66
#include "ihevc_deblk.h"
67
#include "ihevc_itrans_recon.h"
68
#include "ihevc_chroma_itrans_recon.h"
69
#include "ihevc_chroma_intra_pred.h"
70
#include "ihevc_intra_pred.h"
71
#include "ihevc_inter_pred.h"
72
#include "ihevc_mem_fns.h"
73
#include "ihevc_padding.h"
74
#include "ihevc_weighted_pred.h"
75
#include "ihevc_sao.h"
76
#include "ihevc_resi_trans.h"
77
#include "ihevc_quant_iquant_ssd.h"
78
#include "ihevc_cabac_tables.h"
79
80
#include "ihevce_defs.h"
81
#include "ihevce_lap_enc_structs.h"
82
#include "ihevce_multi_thrd_structs.h"
83
#include "ihevce_multi_thrd_funcs.h"
84
#include "ihevce_me_common_defs.h"
85
#include "ihevce_had_satd.h"
86
#include "ihevce_error_codes.h"
87
#include "ihevce_bitstream.h"
88
#include "ihevce_cabac.h"
89
#include "ihevce_rdoq_macros.h"
90
#include "ihevce_function_selector.h"
91
#include "ihevce_enc_structs.h"
92
#include "ihevce_entropy_structs.h"
93
#include "ihevce_cmn_utils_instr_set_router.h"
94
#include "ihevce_enc_loop_structs.h"
95
#include "ihevce_bs_compute_ctb.h"
96
#include "ihevce_global_tables.h"
97
#include "ihevce_dep_mngr_interface.h"
98
#include "hme_datatype.h"
99
#include "hme_interface.h"
100
#include "hme_common_defs.h"
101
#include "hme_defs.h"
102
#include "ihevce_me_instr_set_router.h"
103
#include "hme_globals.h"
104
#include "hme_utils.h"
105
#include "hme_coarse.h"
106
#include "hme_refine.h"
107
#include "hme_err_compute.h"
108
#include "hme_common_utils.h"
109
#include "hme_search_algo.h"
110
#include "ihevce_stasino_helpers.h"
111
112
/******************************************************************************
113
*                         MACRO DEFINITIONS
114
******************************************************************************/
115
116
/*****************************************************************************/
117
/* Theoretically, the various types of SAD functions that are needed for     */
118
/* reasons of optimality. SADs that are to be evaluated at a single pt can be*/
119
/* more optimal than SADs that are to be evaluated for a grid of 3x3. The    */
120
/* SADs to be evaluated at a grid are classified as separate functions, since*/
121
/* evaluating them on a single function call helps reuse inputs for a small  */
122
/* grid of 3x3. Also, if no partial updates are required, there are 3 basic  */
123
/* functions, width 4K (K = odd number), width 8K (K = odd number) and width */
124
/* 16K, K any number. For partial updates, it is assumed that the block size */
125
/* is square (8x8, 16x16, 32x32, 64x64) and further differentiation is done  */
126
/* based on the basic evaluation unit. E.g. if 16x16 blk size requires, part */
127
/* update on AMP partitions, then basic SAD unit is 4x4, if it doesn't, then */
128
/* basic SAD unit is 8x8.                                                    */
129
/*****************************************************************************/
130
131
#define UPD_RES_PT_NPU_BEST1 hme_update_results_grid_pu_bestn
132
#define UPD_RES_PT_NPU_BESTN hme_update_results_grid_pu_bestn
133
#define UPD_RES_PT_PU_BEST1 hme_update_results_grid_pu_bestn
134
#define UPD_RES_PT_PU_BESTN hme_update_results_grid_pu_bestn
135
#define UPD_RES_GRID_NPU_BEST1 hme_update_results_grid_pu_bestn
136
#define UPD_RES_GRID_NPU_BESTN hme_update_results_grid_pu_bestn
137
#define UPD_RES_GRID_PU_BEST1 hme_update_results_grid_pu_bestn
138
#define UPD_RES_GRID_PU_BESTN hme_update_results_grid_pu_bestn
139
140
/*******************************************************************************
141
*                         FUNCTION DEFINITIONS
142
*******************************************************************************/
143
/**
 * Tells whether two search nodes describe the same candidate.
 *
 * Two nodes are considered equal when their motion vector (x and y
 * components) and their reference index all match.
 *
 * @param[in] ps_best_node1 : first node to compare
 * @param[in] ps_best_node2 : second node to compare
 *
 * @return 0 when the nodes match, -1 otherwise
 */
S32 hme_cmp_nodes(search_node_t *ps_best_node1, search_node_t *ps_best_node2)
{
    /* Bail out on the first differing field, checked in mvx, mvy, ref order */
    if(ps_best_node1->s_mv.i2_mvx != ps_best_node2->s_mv.i2_mvx)
    {
        return -1;
    }
    if(ps_best_node1->s_mv.i2_mvy != ps_best_node2->s_mv.i2_mvy)
    {
        return -1;
    }
    if(ps_best_node1->i1_ref_idx != ps_best_node2->i1_ref_idx)
    {
        return -1;
    }
    return 0;
}
153
154
void compute_4x4_sads_for_16x16_blk(
155
    grid_ctxt_t *ps_grid, /* Grid ctxt */
156
    UWORD8 *pu1_cur_ptr, /* Pointer to top-left of current block */
157
    WORD32 cur_buf_stride, /* Buffer stride of current buffer */
158
    UWORD16 **
159
        u2_part_sads, /* 2D Array containing SADs for all 17 partitions. As many rows as partitions. SADs in a row correspond to each of the candidates */
160
    cand_t *ps_cand, /* Return the list of candidates evaluated */
161
    WORD32 *num_cands /* Number of candidates that were processed */
162
)
163
0
{
164
0
    WORD32 a, b, c, d, i;
165
0
    WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
166
0
    WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
167
    //WORD32 offset_x[9] = {-grd_sz_x, 0, grd_sz_x, -grd_sz_x, 0, grd_sz_x, grd_sz_x, 0, -grd_sz_x};
168
    //WORD32 offset_y[9] = {-grd_sz_y, -grd_sz_y, -grd_sz_y, 0, 0, 0, grd_sz_y, grd_sz_y, grd_sz_y};
169
    /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
170
0
    WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
171
0
    WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
172
0
    WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
173
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
174
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
175
0
    cand_t *cand0 = ps_cand;
176
0
    UWORD16 au2_4x4_sad[NUM_4X4];
177
178
0
    *num_cands = 0;
179
180
    /* Loop to fill up the cand_t array and to calculate num_cands */
181
0
    for(i = 0; i < ps_grid->num_grids; i++)
182
0
    {
183
0
        WORD32 j;
184
0
        WORD32 mask = ps_grid->pi4_grd_mask[i];
185
0
        UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
186
0
        WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
187
0
        WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
188
189
0
        for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
190
0
        {
191
0
            if(mask & 1)
192
0
            {
193
0
                *num_cands = *num_cands + 1;
194
0
                cand0->grid_ix = i;
195
0
                cand0->ref_idx = ps_grid->p_ref_idx[i];
196
0
                cand0->pu1_ref_ptr =
197
0
                    pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
198
0
                cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
199
0
                cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
200
0
                cand0++;
201
0
            }
202
0
        }
203
0
    }
204
205
    /* Loop to compute the SAD's */
206
0
    for(a = 0; a < *num_cands; a++)
207
0
    {
208
0
        cand_t *cand = ps_cand + a;
209
0
        memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
210
0
        for(b = 0; b < NUM_4X4; b++)
211
0
        {
212
0
            WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
213
0
            WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
214
215
0
            for(c = 0; c < NUM_ROWS_IN_4X4; c++)
216
0
            {
217
0
                WORD32 z_cur = (cur_buf_stride)*c + t1;
218
0
                WORD32 z_ref = (ref_buf_stride)*c + t2;
219
0
                for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
220
0
                {
221
0
                    au2_4x4_sad[b] += (UWORD16)ABS(
222
0
                        (((S32)cand->pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
223
0
                }
224
0
            }
225
0
        }
226
227
0
        u2_part_sads[PART_ID_NxN_TL][a] =
228
0
            (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
229
0
        u2_part_sads[PART_ID_NxN_TR][a] =
230
0
            (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
231
0
        u2_part_sads[PART_ID_NxN_BL][a] =
232
0
            (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
233
0
        u2_part_sads[PART_ID_NxN_BR][a] =
234
0
            (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
235
0
        u2_part_sads[PART_ID_Nx2N_L][a] =
236
0
            u2_part_sads[PART_ID_NxN_TL][a] + u2_part_sads[PART_ID_NxN_BL][a];
237
0
        u2_part_sads[PART_ID_Nx2N_R][a] =
238
0
            u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_BR][a];
239
0
        u2_part_sads[PART_ID_2NxN_T][a] =
240
0
            u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_TL][a];
241
0
        u2_part_sads[PART_ID_2NxN_B][a] =
242
0
            u2_part_sads[PART_ID_NxN_BR][a] + u2_part_sads[PART_ID_NxN_BL][a];
243
0
        u2_part_sads[PART_ID_nLx2N_L][a] =
244
0
            (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
245
0
        u2_part_sads[PART_ID_nRx2N_R][a] =
246
0
            (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
247
0
        u2_part_sads[PART_ID_2NxnU_T][a] =
248
0
            (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
249
0
        u2_part_sads[PART_ID_2NxnD_B][a] =
250
0
            (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
251
0
        u2_part_sads[PART_ID_2Nx2N][a] =
252
0
            u2_part_sads[PART_ID_2NxN_T][a] + u2_part_sads[PART_ID_2NxN_B][a];
253
0
        u2_part_sads[PART_ID_2NxnU_B][a] =
254
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnU_T][a];
255
0
        u2_part_sads[PART_ID_2NxnD_T][a] =
256
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnD_B][a];
257
0
        u2_part_sads[PART_ID_nRx2N_L][a] =
258
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nRx2N_R][a];
259
0
        u2_part_sads[PART_ID_nLx2N_R][a] =
260
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nLx2N_L][a];
261
0
    }
262
0
}
263
264
/**
265
********************************************************************************
266
*  @fn     compute_part_sads_for_MxM_blk(grid_ctxt_t *ps_grid,
267
*                                       UWORD8      *pu1_cur_ptr,
268
*                                       WORD32      cur_buf_stride,
269
*                                       WORD32     **pi4_part_sads,
270
*                                       cand_t      *ps_cand,
271
*                                       WORD32      *num_cands
272
*
273
*  @brief  Computes partial SADs and updates partition results for an MxM blk
274
*          and does so for several grids of points. This can be used for
275
*          32x32/64x64 blks with 17 partition updates
276
*
277
*
278
*  @param[in]  ps_grid : Pointer to grid ctxt that has multiple grid of max
279
*                        9 pts per grid
280
*
281
*  @param[in]  pu1_cur_ptr : Top left of input buffer
282
*
283
*  @param[in]  pi4_part_sads : array of pointers, each entry pointing to
284
*                             results to be updated for a given partition
285
*
286
*  @return   The ps_search_results structure has the best result updated for
287
*            the 2Nx2N partition alone.
288
289
********************************************************************************
290
*/
291
void compute_part_sads_for_MxM_blk(
292
    grid_ctxt_t *ps_grid,
293
    UWORD8 *pu1_cur_ptr,
294
    WORD32 cur_buf_stride,
295
    WORD32 **pp_part_sads,
296
    cand_t *ps_cand,
297
    WORD32 *num_cands,
298
    CU_SIZE_T e_cu_size)
299
23.6M
{
300
23.6M
    WORD32 a, b, c, d, i;
301
23.6M
    WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
302
23.6M
    WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
303
304
    /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
305
23.6M
    WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
306
23.6M
    WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
307
23.6M
    WORD32 shift = (WORD32)e_cu_size;
308
309
23.6M
    WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
310
23.6M
    WORD32 cur_buf_stride_lsN = (cur_buf_stride << (1 + shift));
311
23.6M
    WORD32 ref_buf_stride_lsN = (ref_buf_stride << (1 + shift));
312
    /* Num rows and pixels per row: 8 for CU_32x32 and 16 for CU_64x64 */
313
23.6M
    WORD32 num_rows_in_nxn = 2 << shift;
314
23.6M
    WORD32 num_pixels_in_row = 2 << shift;
315
23.6M
    cand_t *cand0 = ps_cand;
316
    /* for a 2Nx2N partition we evaluate nxn SADs, where n = N/2. This is */
317
    /* needed for AMP cases.                                              */
318
23.6M
    WORD32 a_nxn_sad[NUM_4X4];
319
23.6M
    *num_cands = 0;
320
321
    /* Loop to fill up the cand_t array and to calculate num_cands */
322
47.3M
    for(i = 0; i < ps_grid->num_grids; i++)
323
23.6M
    {
324
23.6M
        WORD32 j;
325
23.6M
        WORD32 mask = ps_grid->pi4_grd_mask[i];
326
23.6M
        UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
327
23.6M
        WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
328
23.6M
        WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
329
330
236M
        for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
331
213M
        {
332
213M
            if(mask & 1)
333
23.6M
            {
334
23.6M
                *num_cands = *num_cands + 1;
335
23.6M
                cand0->grid_ix = i;
336
23.6M
                cand0->ref_idx = ps_grid->p_ref_idx[i];
337
23.6M
                cand0->pu1_ref_ptr =
338
23.6M
                    pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
339
23.6M
                cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
340
23.6M
                cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
341
23.6M
                cand0++;
342
23.6M
            }
343
213M
        }
344
23.6M
    }
345
346
    /* Loop to compute the SAD's */
347
47.3M
    for(a = 0; a < *num_cands; a++)
348
23.6M
    {
349
23.6M
        cand_t *cand = ps_cand + a;
350
23.6M
        memset(&a_nxn_sad[0], 0, NUM_4X4 * sizeof(WORD32));
351
402M
        for(b = 0; b < NUM_4X4; b++)
352
378M
        {
353
378M
            WORD32 t1 = (b % 4) * num_pixels_in_row + (b >> 2) * cur_buf_stride_lsN;
354
378M
            WORD32 t2 = (b % 4) * num_pixels_in_row + (b >> 2) * ref_buf_stride_lsN;
355
356
1.17G
            for(c = 0; c < num_rows_in_nxn; c++)
357
799M
            {
358
799M
                WORD32 z_cur = (cur_buf_stride)*c + t1;
359
799M
                WORD32 z_ref = (ref_buf_stride)*c + t2;
360
2.77G
                for(d = 0; d < num_pixels_in_row; d++)
361
1.97G
                {
362
1.97G
                    a_nxn_sad[b] += (WORD32)ABS(
363
1.97G
                        (((WORD32)cand->pu1_ref_ptr[(z_ref + d)]) -
364
1.97G
                         ((WORD32)pu1_cur_ptr[(z_cur + d)])));
365
1.97G
                }
366
799M
            }
367
378M
        }
368
369
23.6M
        pp_part_sads[PART_ID_NxN_TL][a] =
370
23.6M
            (a_nxn_sad[0] + a_nxn_sad[1] + a_nxn_sad[4] + a_nxn_sad[5]);
371
23.6M
        pp_part_sads[PART_ID_NxN_TR][a] =
372
23.6M
            (a_nxn_sad[2] + a_nxn_sad[3] + a_nxn_sad[6] + a_nxn_sad[7]);
373
23.6M
        pp_part_sads[PART_ID_NxN_BL][a] =
374
23.6M
            (a_nxn_sad[8] + a_nxn_sad[9] + a_nxn_sad[12] + a_nxn_sad[13]);
375
23.6M
        pp_part_sads[PART_ID_NxN_BR][a] =
376
23.6M
            (a_nxn_sad[10] + a_nxn_sad[11] + a_nxn_sad[14] + a_nxn_sad[15]);
377
23.6M
        pp_part_sads[PART_ID_Nx2N_L][a] =
378
23.6M
            pp_part_sads[PART_ID_NxN_TL][a] + pp_part_sads[PART_ID_NxN_BL][a];
379
23.6M
        pp_part_sads[PART_ID_Nx2N_R][a] =
380
23.6M
            pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_BR][a];
381
23.6M
        pp_part_sads[PART_ID_2NxN_T][a] =
382
23.6M
            pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_TL][a];
383
23.6M
        pp_part_sads[PART_ID_2NxN_B][a] =
384
23.6M
            pp_part_sads[PART_ID_NxN_BR][a] + pp_part_sads[PART_ID_NxN_BL][a];
385
23.6M
        pp_part_sads[PART_ID_nLx2N_L][a] =
386
23.6M
            (a_nxn_sad[8] + a_nxn_sad[0] + a_nxn_sad[12] + a_nxn_sad[4]);
387
23.6M
        pp_part_sads[PART_ID_nRx2N_R][a] =
388
23.6M
            (a_nxn_sad[3] + a_nxn_sad[7] + a_nxn_sad[15] + a_nxn_sad[11]);
389
23.6M
        pp_part_sads[PART_ID_2NxnU_T][a] =
390
23.6M
            (a_nxn_sad[1] + a_nxn_sad[0] + a_nxn_sad[2] + a_nxn_sad[3]);
391
23.6M
        pp_part_sads[PART_ID_2NxnD_B][a] =
392
23.6M
            (a_nxn_sad[15] + a_nxn_sad[14] + a_nxn_sad[12] + a_nxn_sad[13]);
393
23.6M
        pp_part_sads[PART_ID_2Nx2N][a] =
394
23.6M
            pp_part_sads[PART_ID_2NxN_T][a] + pp_part_sads[PART_ID_2NxN_B][a];
395
23.6M
        pp_part_sads[PART_ID_2NxnU_B][a] =
396
23.6M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnU_T][a];
397
23.6M
        pp_part_sads[PART_ID_2NxnD_T][a] =
398
23.6M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnD_B][a];
399
23.6M
        pp_part_sads[PART_ID_nRx2N_L][a] =
400
23.6M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nRx2N_R][a];
401
23.6M
        pp_part_sads[PART_ID_nLx2N_R][a] =
402
23.6M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nLx2N_L][a];
403
23.6M
    }
404
23.6M
}
405
406
/**
 * Evaluates partition SADs of a 16x16 block over a single grid of up to 9
 * points and writes them, widened to the element type of pi4_sad_grid, into
 * ps_prms->pi4_sad_grid.
 *
 * A one-grid context is built on the stack from ps_prms (zero mv, ref idx 0)
 * and handed to compute_4x4_sads_for_16x16_blk. Results are laid out
 * partition-major: entry [part * num_points + k] belongs to partition `part`
 * and the k-th enabled grid point.
 *
 * @param[in,out] ps_prms : error computation params (input/ref pointers and
 *                          strides, step, grid mask, output SAD grid)
 */
void hme_evalsad_grid_pu_16x16(err_prms_t *ps_prms)
{
    grid_ctxt_t s_grid;
    cand_t as_candt[9];
    U16 au2_sad_grid[TOT_NUM_PARTS * 9];
    U16 *apu2_sad_grid[TOT_NUM_PARTS];
    hme_mv_t s_mv = { 0, 0 };
    S32 i4_ref_idx = 0;
    S32 num_candts = 0;
    S32 bit, part, k;

    /* Describe a single grid centred on the reference pointer */
    s_grid.num_grids = 1;
    s_grid.ref_buf_stride = ps_prms->i4_ref_stride;
    /* Same step in both directions: y in the upper half, x in the lower */
    s_grid.grd_sz_y_x = ((ps_prms->i4_step << 16) | ps_prms->i4_step);
    s_grid.ppu1_ref_ptr = &ps_prms->pu1_ref;
    s_grid.pi4_grd_mask = &ps_prms->i4_grid_mask;
    s_grid.p_mv = &s_mv;
    s_grid.p_ref_idx = &i4_ref_idx;

    /* Number of enabled points decides the row pitch of the SAD table */
    for(bit = 0; bit < 9; bit++)
    {
        num_candts += (s_grid.pi4_grd_mask[0] & (1 << bit)) ? 1 : 0;
    }

    for(part = 0; part < TOT_NUM_PARTS; part++)
    {
        apu2_sad_grid[part] = au2_sad_grid + part * num_candts;
    }

    compute_4x4_sads_for_16x16_blk(
        &s_grid, ps_prms->pu1_inp, ps_prms->i4_inp_stride, apu2_sad_grid, as_candt, &num_candts);

    /* Widen the 16 bit results into the caller's SAD grid */
    k = 0;
    while(k < TOT_NUM_PARTS * num_candts)
    {
        ps_prms->pi4_sad_grid[k] = au2_sad_grid[k];
        k++;
    }
}
438
439
/**
 * Evaluates the SAD of a single MxN block (no partial updates) at every
 * enabled point of a 9 point grid around ps_prms->pu1_ref.
 *
 * Results are written contiguously, one per enabled point in ascending grid
 * id order, starting at
 * pi4_sad_grid + pi4_valid_part_ids[0] * (number of enabled points).
 * The part mask is asserted to have at most one bit set.
 *
 * @param[in,out] ps_prms : error computation params (input/ref pointers and
 *                          strides, block size, step, grid mask, output grid)
 */
void hme_evalsad_grid_npu_MxN(err_prms_t *ps_prms)
{
    U08 *pu1_inp_base, *pu1_ref_c;
    S32 *pi4_out = ps_prms->pi4_sad_grid;
    S32 pt, num_pts = 0;
    S32 step = ps_prms->i4_step;
    /* Horizontal and vertical distance between neighbouring grid points */
    S32 x_off = step, y_off = step * ps_prms->i4_ref_stride;

    ASSERT((ps_prms->i4_part_mask & (ps_prms->i4_part_mask - 1)) == 0);

    /* Count the enabled points; this is the row pitch of the SAD grid */
    for(pt = 0; pt < 9; pt++)
    {
        num_pts += (ps_prms->i4_grid_mask & (1 << pt)) ? 1 : 0;
    }
    pi4_out += (ps_prms->pi4_valid_part_ids[0] * num_pts);

    pu1_inp_base = ps_prms->pu1_inp;
    pu1_ref_c = ps_prms->pu1_ref;

    for(pt = 0; pt < 9; pt++)
    {
        if(ps_prms->i4_grid_mask & (1 << pt))
        {
            S32 sad = 0, row, col;
            U08 *pu1_inp = pu1_inp_base;
            /* Move from the grid centre to this point */
            U08 *pu1_ref = pu1_ref_c + x_off * gai1_grid_id_to_x[pt];

            pu1_ref += y_off * gai1_grid_id_to_y[pt];

            for(row = 0; row < ps_prms->i4_blk_ht; row++)
            {
                for(col = 0; col < ps_prms->i4_blk_wd; col++)
                {
                    sad += (ABS((pu1_inp[col] - pu1_ref[col])));
                }
                pu1_inp += ps_prms->i4_inp_stride;
                pu1_ref += ps_prms->i4_ref_stride;
            }
            *pi4_out++ = sad;
        }
    }
}
483
484
WORD32 hme_evalsad_pt_npu_MxN_8bit_compute(
485
    WORD32 ht,
486
    WORD32 wd,
487
    UWORD8 *pu1_inp,
488
    UWORD8 *pu1_ref,
489
    WORD32 i4_inp_stride,
490
    WORD32 i4_ref_stride)
491
35.5M
{
492
35.5M
    WORD32 i, j;
493
35.5M
    WORD32 sad = 0;
494
299M
    for(i = 0; i < ht; i++)
495
264M
    {
496
2.54G
        for(j = 0; j < wd; j++)
497
2.28G
        {
498
2.28G
            sad += (ABS(((S32)pu1_inp[j] - (S32)pu1_ref[j])));
499
2.28G
        }
500
264M
        pu1_inp += i4_inp_stride;
501
264M
        pu1_ref += i4_ref_stride;
502
264M
    }
503
35.5M
    return sad;
504
35.5M
}
505
506
/**
 * Single point SAD (no partial updates) for an MxN 8 bit block.
 *
 * Unpacks block size, pointers and strides from ps_prms, forwards them to
 * hme_evalsad_pt_npu_MxN_8bit_compute and stores the result in the first
 * entry of ps_prms->pi4_sad_grid.
 *
 * @param[in,out] ps_prms : error computation params
 */
void hme_evalsad_pt_npu_MxN_8bit(err_prms_t *ps_prms)
{
    ps_prms->pi4_sad_grid[0] = hme_evalsad_pt_npu_MxN_8bit_compute(
        ps_prms->i4_blk_ht,
        ps_prms->i4_blk_wd,
        ps_prms->pu1_inp,
        ps_prms->pu1_ref,
        ps_prms->i4_inp_stride,
        ps_prms->i4_ref_stride);
}
520
521
/**
 * Computes the SATD between the input and reference blocks described by
 * ps_prms and stores it in the first entry of ps_prms->pi4_sad_grid.
 *
 * The block is tiled with Hadamard transforms taken from the optimised
 * function list in ps_prms: 8x8 transforms when either dimension exceeds 16,
 * 4x4 transforms otherwise. The per-tile results are summed.
 *
 * @param[in,out] ps_prms : error computation params (input/ref pointers and
 *                          strides, block size, function list, output grid)
 */
void compute_satd_8bit(err_prms_t *ps_prms)
{
    U08 *pu1_src_row = ps_prms->pu1_inp;
    U08 *pu1_pred_row = ps_prms->pu1_ref;
    S32 src_strd = ps_prms->i4_inp_stride;
    S32 dst_strd = ps_prms->i4_ref_stride;
    S32 wd = ps_prms->i4_blk_wd;
    S32 ht = ps_prms->i4_blk_ht;
    U32 u4_satd = 0;
    WORD32 x, y;

    /* Blocks up to 16x16 use a 4x4 basic transform, anything larger in */
    /* either dimension uses an 8x8 basic transform                     */
    WORD32 use_8x8 = ((wd > 0x10) || (ht > 0x10));
    WORD32 tile = use_8x8 ? 8 : 4;

    for(y = 0; y < ht; y += tile)
    {
        for(x = 0; x < wd; x += tile)
        {
            if(use_8x8)
            {
                u4_satd += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
                    &pu1_src_row[x], src_strd, &pu1_pred_row[x], dst_strd, NULL, 1);
            }
            else
            {
                u4_satd += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
                    &pu1_src_row[x], src_strd, &pu1_pred_row[x], dst_strd, NULL, 1);
            }
        }
        /* Step down by one row of tiles */
        pu1_src_row += src_strd * tile;
        pu1_pred_row += dst_strd * tile;
    }

    ps_prms->pi4_sad_grid[0] = (S32)u4_satd;
}
574
575
void hme_init_pred_part(
576
    pred_ctxt_t *ps_pred_ctxt,
577
    search_node_t *ps_tl,
578
    search_node_t *ps_t,
579
    search_node_t *ps_tr,
580
    search_node_t *ps_l,
581
    search_node_t *ps_bl,
582
    search_node_t *ps_coloc,
583
    search_node_t *ps_zeromv,
584
    search_node_t **pps_proj_coloc,
585
    PART_ID_T e_part_id)
586
79.6M
{
587
79.6M
    pred_candt_nodes_t *ps_candt_nodes;
588
589
79.6M
    ps_candt_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
590
591
79.6M
    ps_candt_nodes->ps_tl = ps_tl;
592
79.6M
    ps_candt_nodes->ps_tr = ps_tr;
593
79.6M
    ps_candt_nodes->ps_t = ps_t;
594
79.6M
    ps_candt_nodes->ps_l = ps_l;
595
79.6M
    ps_candt_nodes->ps_bl = ps_bl;
596
79.6M
    ps_candt_nodes->ps_coloc = ps_coloc;
597
79.6M
    ps_candt_nodes->ps_zeromv = ps_zeromv;
598
79.6M
    ps_candt_nodes->pps_proj_coloc = pps_proj_coloc;
599
79.6M
}
600
601
void hme_init_pred_ctxt_no_encode(
602
    pred_ctxt_t *ps_pred_ctxt,
603
    search_results_t *ps_search_results,
604
    search_node_t *ps_top_candts,
605
    search_node_t *ps_left_candts,
606
    search_node_t **pps_proj_coloc_candts,
607
    search_node_t *ps_coloc_candts,
608
    search_node_t *ps_zeromv_candt,
609
    S32 pred_lx,
610
    S32 lambda,
611
    S32 lambda_q_shift,
612
    U08 **ppu1_ref_bits_tlu,
613
    S16 *pi2_ref_scf)
614
605k
{
615
605k
    search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
616
605k
    search_node_t *ps_coloc;
617
605k
    PART_ID_T e_part_id;
618
619
    /* Assume that resolution is subpel to begin with */
620
605k
    ps_pred_ctxt->mv_pel = 0;  // FPEL
621
622
    /* lambda and pred_lx (PRED_L0/PRED_L1) */
623
605k
    ps_pred_ctxt->lambda = lambda;
624
605k
    ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
625
605k
    ps_pred_ctxt->pred_lx = pred_lx;
626
605k
    ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
627
605k
    ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
628
605k
    ps_pred_ctxt->proj_used = 0;
629
630
    /* Bottom left should not be valid */
631
605k
    ASSERT(ps_left_candts[2].u1_is_avail == 0);
632
605k
    ps_invalid = &ps_left_candts[2];
633
634
    /*************************************************************************/
635
    /* for the case of no encode, the idea is to set up cants as follows     */
636
    /*                                                                       */
637
    /*    ____ ______________                                                */
638
    /*   | TL | T  | T1 | TR |                                               */
639
    /*   |____|____|____|____|                                               */
640
    /*   | L  | b0 | b1 |                                                    */
641
    /*   |____|____|____|                                                    */
642
    /*   | L1 | b2 | b3 |                                                    */
643
    /*   |____|____|____|                                                    */
644
    /*   | BL |                                                              */
645
    /*   |____|                                                              */
646
    /*                                                                       */
647
    /*  If use_4x4 is 0, then b0,b1,b2,b3 are single 8x8 blk. then T=T1      */
648
    /* and L=L1. topleft, top and topright are TL,T,TR respectively          */
649
    /* Left and bottom left is L and BL respectively.                        */
650
    /* If use_4x4 is 1: then the above holds true only for PARTID = 0 (8x8)  */
651
    /*  For the 4 subblocks (partids 4-7)                                    */
652
    /*                                                                       */
653
    /*  Block   Left   Top   Top Left   Top Right   Bottom Left             */
654
    /*    b0    L      T      TL          T1          L1                     */
655
    /*    b1    b0     T1     T           TR          BL(invalid)            */
656
    /*    b2    L1     b0     L0          b1          BL (invalid)           */
657
    /*    b3    b2     b1     b0          BL(inv)     BL (inv)               */
658
    /*                                                                       */
659
    /* Note : For block b1, bottom left pts to b2, which is not yet ready    */
660
    /*  hence it is kept invalid and made to pt to BL. For block b3 top rt   */
661
    /* is invalid and hence made to pt to BL which is invalid.               */
662
    /* BL is invalid since it lies in a bottom left 8x8 blk and not yet ready*/
663
    /*************************************************************************/
664
665
    /* ps_coloc always points to a fixe candt (global) */
666
    /* TODO : replace incoming ps_coloc from global to geniune coloc */
667
605k
    ps_coloc = ps_coloc_candts;
668
669
    /* INITIALIZATION OF 8x8 BLK */
670
605k
    ps_tl = ps_top_candts;
671
605k
    ps_t = ps_tl + 2;
672
605k
    ps_tr = ps_t + 1;
673
605k
    ps_l = ps_left_candts + 1;
674
605k
    ps_bl = ps_invalid;
675
605k
    e_part_id = PART_ID_2Nx2N;
676
605k
    hme_init_pred_part(
677
605k
        ps_pred_ctxt,
678
605k
        ps_tl,
679
605k
        ps_t,
680
605k
        ps_tr,
681
605k
        ps_l,
682
605k
        ps_bl,
683
605k
        ps_coloc,
684
605k
        ps_zeromv_candt,
685
605k
        pps_proj_coloc_candts,
686
605k
        e_part_id);
687
688
    /* INITIALIZATION OF 4x4 TL BLK */
689
605k
    e_part_id = PART_ID_NxN_TL;
690
605k
    ps_tl = ps_top_candts;
691
605k
    ps_t = ps_tl + 1;
692
605k
    ps_tr = ps_t + 1;
693
605k
    ps_l = ps_left_candts;
694
605k
    ps_bl = ps_l + 1;
695
605k
    hme_init_pred_part(
696
605k
        ps_pred_ctxt,
697
605k
        ps_tl,
698
605k
        ps_t,
699
605k
        ps_tr,
700
605k
        ps_l,
701
605k
        ps_bl,
702
605k
        ps_coloc,
703
605k
        ps_zeromv_candt,
704
605k
        pps_proj_coloc_candts,
705
605k
        e_part_id);
706
707
    /* INITIALIZATION OF 4x4 TR BLK */
708
605k
    e_part_id = PART_ID_NxN_TR;
709
605k
    ps_tl = ps_top_candts + 1;
710
605k
    ps_t = ps_tl + 1;
711
605k
    ps_tr = ps_t + 1;
712
605k
    ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
713
605k
    ps_bl = ps_invalid;
714
605k
    hme_init_pred_part(
715
605k
        ps_pred_ctxt,
716
605k
        ps_tl,
717
605k
        ps_t,
718
605k
        ps_tr,
719
605k
        ps_l,
720
605k
        ps_bl,
721
605k
        ps_coloc,
722
605k
        ps_zeromv_candt,
723
605k
        pps_proj_coloc_candts,
724
605k
        e_part_id);
725
726
    /* INITIALIZATION OF 4x4 BL BLK */
727
605k
    e_part_id = PART_ID_NxN_BL;
728
605k
    ps_tl = ps_left_candts;
729
605k
    ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
730
605k
    ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
731
605k
    ps_l = ps_left_candts + 1;
732
605k
    ps_bl = ps_invalid;  //invalid
733
605k
    hme_init_pred_part(
734
605k
        ps_pred_ctxt,
735
605k
        ps_tl,
736
605k
        ps_t,
737
605k
        ps_tr,
738
605k
        ps_l,
739
605k
        ps_bl,
740
605k
        ps_coloc,
741
605k
        ps_zeromv_candt,
742
605k
        pps_proj_coloc_candts,
743
605k
        e_part_id);
744
745
    /* INITIALIZATION OF 4x4 BR BLK */
746
605k
    e_part_id = PART_ID_NxN_BR;
747
605k
    ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
748
605k
    ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
749
605k
    ps_tr = ps_invalid;  // invalid
750
605k
    ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
751
605k
    ps_bl = ps_invalid;  // invalid
752
605k
    hme_init_pred_part(
753
605k
        ps_pred_ctxt,
754
605k
        ps_tl,
755
605k
        ps_t,
756
605k
        ps_tr,
757
605k
        ps_l,
758
605k
        ps_bl,
759
605k
        ps_coloc,
760
605k
        ps_zeromv_candt,
761
605k
        pps_proj_coloc_candts,
762
605k
        e_part_id);
763
605k
}
764
765
void hme_init_pred_ctxt_encode(
766
    pred_ctxt_t *ps_pred_ctxt,
767
    search_results_t *ps_search_results,
768
    search_node_t *ps_coloc_candts,
769
    search_node_t *ps_zeromv_candt,
770
    mv_grid_t *ps_mv_grid,
771
    S32 pred_lx,
772
    S32 lambda,
773
    S32 lambda_q_shift,
774
    U08 **ppu1_ref_bits_tlu,
775
    S16 *pi2_ref_scf)
776
4.50M
{
777
4.50M
    search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
778
4.50M
    search_node_t *ps_coloc;
779
4.50M
    search_node_t *ps_grid_cu_base;
780
4.50M
    CU_SIZE_T e_cu_size = ps_search_results->e_cu_size;
781
782
    /* Part Start, Part sizes in 4x4 units */
783
4.50M
    S32 part_wd, part_ht, part_start_x, part_start_y;
784
785
    /* Partition type, number of partitions in type */
786
4.50M
    S32 part_id;
787
788
    /* Coordinates of the CU in 4x4 units */
789
4.50M
    S32 cu_start_x, cu_start_y;
790
4.50M
    S32 shift = e_cu_size;
791
792
    /* top right and bot left validity at CU level */
793
4.50M
    S32 cu_tr_valid, cu_bl_valid;
794
    /* strideo f the grid */
795
4.50M
    S32 grid_stride = ps_mv_grid->i4_stride;
796
797
4.50M
    ps_pred_ctxt->lambda = lambda;
798
4.50M
    ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
799
4.50M
    ps_pred_ctxt->pred_lx = pred_lx;
800
4.50M
    ps_pred_ctxt->mv_pel = 0;
801
4.50M
    ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
802
4.50M
    ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
803
4.50M
    ps_pred_ctxt->proj_used = 1;
804
805
4.50M
    cu_start_x = ps_search_results->u1_x_off >> 2;
806
4.50M
    cu_start_y = ps_search_results->u1_y_off >> 2;
807
808
    /* Coloc always points to fixed global candt */
809
4.50M
    ps_coloc = ps_coloc_candts;
810
811
    /* Go to base of the CU in the MV Grid */
812
4.50M
    ps_grid_cu_base = &ps_mv_grid->as_node[0];
813
4.50M
    ps_grid_cu_base += (ps_mv_grid->i4_start_offset + cu_start_x);
814
4.50M
    ps_grid_cu_base += (grid_stride * cu_start_y);
815
816
    /* points to the real bottom left of the grid, will never be valid */
817
4.50M
    ps_invalid = &ps_mv_grid->as_node[0];
818
4.50M
    ps_invalid += (grid_stride * 17);
819
820
4.50M
    {
821
4.50M
        S32 shift = 1 + e_cu_size;
822
4.50M
        cu_tr_valid = gau1_cu_tr_valid[cu_start_y >> shift][cu_start_x >> shift];
823
4.50M
        cu_bl_valid = gau1_cu_bl_valid[cu_start_y >> shift][cu_start_x >> shift];
824
4.50M
    }
825
826
    /*************************************************************************/
827
    /* for the case of    encode, the idea is to set up cants as follows     */
828
    /*                                                                       */
829
    /*    ____ ______________ ____ ____                                      */
830
    /*   | T0 | T1 | T2 | T3 | T4 | T5 |                                     */
831
    /*   |____|____|____|____|____|____|                                     */
832
    /*   | L1 |    |              |                                          */
833
    /*   |____|    |              |                                          */
834
    /*   | L2 | p0 |     p1       |                                          */
835
    /*   |____|    |              |                                          */
836
    /*   | L3 |    |              |                                          */
837
    /*   |____|    |              |                                          */
838
    /*   | L4 | L' |              |                                          */
839
    /*   |____|____|______________|                                          */
840
    /*   | BL |                                                              */
841
    /*   |____|                                                              */
842
    /*  The example is shown with 16x16 CU, though it can be generalized     */
843
    /*  This CU has 2 partitions, cu_wd = 4. also p_wd, p_ht are partition   */
844
    /*  width and ht in 4x4 units.                                           */
845
    /*  For a given CU, derive the top left, top and bottom left and top rt  */
846
    /*  pts. Left and top are assumed to be valid.                           */
847
    /*  IF there aretwo partitions in the CU (like p0 and p1) and vertical,  */
848
    /*  then for first partition, left, top, top left and top right valid    */
849
    /*  Bottom left is valid. store these validity flags. Also store the     */
850
    /*  grid offsets of the partitions w.r.t. CU start in units of 4x4.For p0*/
851
    /*  Left grid offset = -1, 3. Top Grd offset = -1, 0.                    */
852
    /*  Top left grid offset = -1, -1. Top right = 1, -1. BL = -1, 4.        */
853
    /*  For p1, validity flags are left, top, top left, top right, valid.    */
854
    /*  BL is invalid. Grid offsets are: Left = dont care. T = 1, -1 (T2)    */
855
    /*  TR = 4, -1 (T5). TL = 0, -1 (T1). BL = don't care.                   */
856
    /*  For p1, set the left pred candt to the best search result of p0.     */
857
    /*************************************************************************/
858
859
    /* Loop over all partitions, and identify the 5 neighbours */
860
81.1M
    for(part_id = 0; part_id < TOT_NUM_PARTS; part_id++)
861
76.6M
    {
862
76.6M
        part_attr_t *ps_part_attr = &gas_part_attr_in_cu[part_id];
863
76.6M
        S32 tr_valid, bl_valid, is_vert;
864
76.6M
        search_node_t *ps_grid_pu_base;
865
76.6M
        PART_TYPE_T e_part_type;
866
76.6M
        PART_ID_T first_part;
867
76.6M
        S32 part_num;
868
869
76.6M
        e_part_type = ge_part_id_to_part_type[part_id];
870
76.6M
        first_part = ge_part_type_to_part_id[e_part_type][0];
871
76.6M
        is_vert = gau1_is_vert_part[e_part_type];
872
76.6M
        part_num = gau1_part_id_to_part_num[part_id];
873
76.6M
        tr_valid = gau1_partid_tr_valid[part_id] & cu_tr_valid;
874
76.6M
        bl_valid = gau1_partid_bl_valid[part_id] & cu_bl_valid;
875
876
76.6M
        part_start_x = (ps_part_attr->u1_x_start << shift) >> 2;
877
76.6M
        part_start_y = (ps_part_attr->u1_y_start << shift) >> 2;
878
76.6M
        part_wd = (ps_part_attr->u1_x_count << shift) >> 2;
879
76.6M
        part_ht = (ps_part_attr->u1_y_count << shift) >> 2;
880
881
        /* go to top left of part */
882
76.6M
        ps_grid_pu_base = ps_grid_cu_base + part_start_x;
883
76.6M
        ps_grid_pu_base += (part_start_y * grid_stride);
884
885
76.6M
        ps_tl = ps_grid_pu_base - 1 - grid_stride;
886
76.6M
        ps_t = ps_grid_pu_base - grid_stride + part_wd - 1;
887
76.6M
        ps_l = ps_grid_pu_base - 1 + ((part_ht - 1) * grid_stride);
888
76.6M
        ps_tr = ps_t + 1;
889
76.6M
        ps_bl = ps_l + grid_stride;
890
891
76.6M
        if(!tr_valid)
892
34.7M
            ps_tr = ps_invalid;
893
76.6M
        if(!bl_valid)
894
56.0M
            ps_bl = ps_invalid;
895
896
76.6M
        if(part_num == 1)
897
31.5M
        {
898
            /* for cases of two partitions 2nd part has 1st part as candt */
899
            /* if vertical type, left candt of 2nd part is 1st part.      */
900
            /* if horz type, top candt of 2nd part is 1st part.           */
901
31.5M
            if(is_vert)
902
18.0M
            {
903
18.0M
                ps_l = ps_search_results->aps_part_results[pred_lx][first_part];
904
18.0M
            }
905
13.5M
            else
906
13.5M
            {
907
13.5M
                ps_t = ps_search_results->aps_part_results[pred_lx][first_part];
908
13.5M
            }
909
31.5M
        }
910
76.6M
        if(part_num == 2)
911
4.50M
        {
912
            /* only possible for NxN_BL */
913
4.50M
            ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
914
4.50M
            ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
915
4.50M
        }
916
76.6M
        if(part_num == 3)
917
4.50M
        {
918
            /* only possible for NxN_BR */
919
4.50M
            ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
920
4.50M
            ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
921
4.50M
            ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
922
4.50M
        }
923
76.6M
        hme_init_pred_part(
924
76.6M
            ps_pred_ctxt,
925
76.6M
            ps_tl,
926
76.6M
            ps_t,
927
76.6M
            ps_tr,
928
76.6M
            ps_l,
929
76.6M
            ps_bl,
930
76.6M
            ps_coloc,
931
76.6M
            ps_zeromv_candt,
932
76.6M
            NULL,
933
76.6M
            (PART_ID_T)part_id);
934
76.6M
    }
935
4.50M
}
936
937
/**
938
********************************************************************************
939
*  @fn     compute_mv_cost_explicit(search_node_t *ps_node,
940
*                   pred_ctxt_t *ps_pred_ctxt,
941
*                   PART_ID_T e_part_id)
942
*
943
*  @brief  MV cost for explicit search in layers not encoded
944
*
945
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
946
*
947
*  @param[in]  ps_pred_ctxt : mv pred context
948
*
949
*  @param[in]  e_part_id : Partition id.
950
*
951
*  @return   Cost value
952
953
********************************************************************************
954
*/
955
S32 compute_mv_cost_explicit(
956
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
957
70.9M
{
958
70.9M
#define RETURN_FIXED_COST 0
959
70.9M
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
960
70.9M
    pred_candt_nodes_t *ps_pred_nodes;
961
70.9M
    S32 inp_shift = 2 - inp_mv_pel;
962
70.9M
    S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
963
70.9M
    S32 mv_p_x, mv_p_y;
964
70.9M
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
965
70.9M
    S32 cost, ref_bits;
966
967
    /*************************************************************************/
968
    /* Logic for cost computation for explicit search. For such a search,    */
969
    /* it is guaranteed that all predictor candts have same ref id. The only */
970
    /* probable issue is with the availability which needs checking. This fxn*/
971
    /* does not suffer the need to scale predictor candts due to diff ref id */
972
    /*************************************************************************/
973
974
    /* Hack: currently we always assume 2Nx2N. */
975
    /* TODO: get rid of this hack and return cost tuned to each partition */
976
70.9M
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
977
70.9M
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
978
979
    /*************************************************************************/
980
    /* Priority to bottom left availability. Else we go to left. If both are */
981
    /* not available, then a remains null                                    */
982
    /*************************************************************************/
983
70.9M
    if(ps_pred_nodes->ps_tl->u1_is_avail)
984
41.5M
        ps_pred_node_a = ps_pred_nodes->ps_tl;
985
29.4M
    else if(ps_pred_nodes->ps_l->u1_is_avail)
986
12.7M
        ps_pred_node_a = ps_pred_nodes->ps_l;
987
988
    /*************************************************************************/
989
    /* For encoder, top left may not be really needed unless we use slices,  */
990
    /* and even then in ME it may not be relevant. So we only consider T or  */
991
    /* TR, as, if both T and TR are not available, TL also will not be       */
992
    /*************************************************************************/
993
70.9M
    if(ps_pred_nodes->ps_tr->u1_is_avail)
994
41.2M
        ps_pred_node_b = ps_pred_nodes->ps_tr;
995
29.7M
    else if(ps_pred_nodes->ps_t->u1_is_avail)
996
13.0M
        ps_pred_node_b = ps_pred_nodes->ps_t;
997
998
70.9M
    if(ps_pred_node_a == NULL)
999
16.6M
    {
1000
16.6M
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
1001
16.6M
        if(ps_pred_node_b == NULL)
1002
3.92M
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1003
16.6M
    }
1004
54.2M
    else if(ps_pred_node_b == NULL)
1005
12.7M
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1006
41.5M
    else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1007
37.2M
    {
1008
37.2M
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1009
37.2M
    }
1010
1011
70.9M
    mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1012
70.9M
    mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1013
70.9M
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1014
70.9M
    mvdx1 = ABS(mvdx1);
1015
70.9M
    mvdy1 = ABS(mvdy1);
1016
1017
70.9M
    mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1018
70.9M
    mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1019
70.9M
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1020
70.9M
    mvdx2 = ABS(mvdx2);
1021
70.9M
    mvdy2 = ABS(mvdy2);
1022
1023
70.9M
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1024
7.12M
    {
1025
7.12M
        cost =
1026
7.12M
            hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1027
7.12M
    }
1028
63.8M
    else
1029
63.8M
    {
1030
63.8M
        cost =
1031
63.8M
            hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1032
63.8M
    }
1033
70.9M
    {
1034
70.9M
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1035
70.9M
        return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1036
70.9M
    }
1037
70.9M
}
1038
/**
1039
********************************************************************************
1040
*  @fn     compute_mv_cost_coarse(search_node_t *ps_node,
1041
*                   pred_ctxt_t *ps_pred_ctxt,
1042
*                   PART_ID_T e_part_id)
1043
*
1044
*  @brief  MV cost for coarse explicit search in coarsest layer
1045
*
1046
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1047
*
1048
*  @param[in]  ps_pred_ctxt : mv pred context
1049
*
1050
*  @param[in]  e_part_id : Partition id.
1051
*
1052
*  @return   Cost value
1053
1054
********************************************************************************
1055
*/
1056
S32 compute_mv_cost_coarse(
1057
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1058
70.9M
{
1059
70.9M
    ARG_NOT_USED(e_part_id);
1060
1061
70.9M
    return (compute_mv_cost_explicit(ps_node, ps_pred_ctxt, PART_ID_2Nx2N, inp_mv_pel));
1062
70.9M
}
1063
1064
/**
1065
********************************************************************************
1066
*  @fn     compute_mv_cost_coarse_high_speed(search_node_t *ps_node,
1067
*                                            pred_ctxt_t *ps_pred_ctxt,
1068
*                                            PART_ID_T e_part_id)
1069
*
1070
*  @brief  MV cost for coarse explicit search in coarsest layer
1071
*
1072
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1073
*
1074
*  @param[in]  ps_pred_ctxt : mv pred context
1075
*
1076
*  @param[in]  e_part_id : Partition id.
1077
*
1078
*  @return   Cost value
1079
1080
********************************************************************************
1081
*/
1082
S32 compute_mv_cost_coarse_high_speed(
1083
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1084
597M
{
1085
597M
    S32 rnd, mvx, mvy, i4_search_idx;
1086
597M
    S32 cost;
1087
1088
597M
    mvx = ps_node->s_mv.i2_mvx;
1089
597M
    mvy = ps_node->s_mv.i2_mvy;
1090
597M
    i4_search_idx = ps_node->i1_ref_idx;
1091
1092
597M
    cost = (2 * hme_get_range(ABS(mvx)) - 1) + (2 * hme_get_range(ABS(mvy)) - 1) + i4_search_idx;
1093
597M
    cost += (mvx != 0) ? 1 : 0;
1094
597M
    cost += (mvy != 0) ? 1 : 0;
1095
597M
    rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1096
597M
    cost = (cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift;
1097
597M
    return cost;
1098
597M
}
1099
1100
/**
1101
********************************************************************************
1102
*  @fn     compute_mv_cost_explicit_refine(search_node_t *ps_node,
1103
*                                          pred_ctxt_t *ps_pred_ctxt,
1104
*                                          PART_ID_T e_part_id)
1105
*
1106
*  @brief  MV cost for explicit search in layers not encoded. Always returns
1107
*          cost of the projected colocated candidate
1108
*
1109
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1110
*
1111
*  @param[in]  ps_pred_ctxt : mv pred context
1112
*
1113
*  @param[in]  e_part_id : Partition id.
1114
*
1115
*  @return   Cost value
1116
1117
********************************************************************************
1118
*/
1119
S32 compute_mv_cost_explicit_refine(
1120
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1121
132M
{
1122
132M
    search_node_t *ps_pred_node_a = NULL;
1123
132M
    pred_candt_nodes_t *ps_pred_nodes;
1124
132M
    S32 inp_shift = 2 - inp_mv_pel;
1125
132M
    S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
1126
132M
    S32 mv_p_x, mv_p_y;
1127
132M
    S16 mvdx1, mvdy1;
1128
132M
    S32 cost, ref_bits;
1129
1130
132M
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1131
132M
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1132
1133
132M
    ps_pred_node_a = ps_pred_nodes->pps_proj_coloc[0];
1134
1135
132M
    mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1136
132M
    mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1137
132M
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1138
132M
    mvdx1 = ABS(mvdx1);
1139
132M
    mvdy1 = ABS(mvdy1);
1140
1141
132M
    cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1142
1143
132M
    {
1144
132M
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1145
132M
        return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1146
132M
    }
1147
132M
}
1148
1149
/**
1150
********************************************************************************
1151
*  @fn     compute_mv_cost_refine(search_node_t *ps_node,
1152
*                   pred_ctxt_t *ps_pred_ctxt,
1153
*                   PART_ID_T e_part_id)
1154
*
1155
*  @brief  MV cost for coarse explicit search in coarsest layer
1156
*
1157
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1158
*
1159
*  @param[in]  ps_pred_ctxt : mv pred context
1160
*
1161
*  @param[in]  e_part_id : Partition id.
1162
*
1163
*  @return   Cost value
1164
1165
********************************************************************************
1166
*/
1167
S32 compute_mv_cost_refine(
1168
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1169
132M
{
1170
132M
    return (compute_mv_cost_explicit_refine(ps_node, ps_pred_ctxt, e_part_id, inp_mv_pel));
1171
132M
}
1172
1173
S32 compute_mv_cost_implicit(
1174
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1175
0
{
1176
0
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
1177
0
    pred_candt_nodes_t *ps_pred_nodes;
1178
0
    S08 i1_ref_idx;
1179
0
    S08 i1_ref_tl = -1, i1_ref_tr = -1, i1_ref_t = -1;
1180
0
    S08 i1_ref_bl = -1, i1_ref_l = -1;
1181
0
    S32 inp_shift = 2 - inp_mv_pel;
1182
0
    S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel;*/
1183
0
    S32 ref_bits, cost;
1184
0
    S32 mv_p_x, mv_p_y;
1185
0
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
1186
1187
    //return 0;
1188
0
    i1_ref_idx = ps_node->i1_ref_idx;
1189
1190
    /*************************************************************************/
1191
    /* Logic for cost computation for explicit search. For such a search,    */
1192
    /* it is guaranteed that all predictor candts have same ref id. The only */
1193
    /* probable issue is with the availability which needs checking. This fxn*/
1194
    /* does not suffer the need to scale predictor candts due to diff ref id */
1195
    /*************************************************************************/
1196
1197
0
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1198
0
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
1199
1200
    /*************************************************************************/
1201
    /* Priority to bottom left availability. Else we go to left. If both are */
1202
    /* not available, then a remains null                                    */
1203
    /*************************************************************************/
1204
0
    if(ps_pred_nodes->ps_bl->u1_is_avail)
1205
0
        i1_ref_bl = ps_pred_nodes->ps_bl->i1_ref_idx;
1206
0
    if(ps_pred_nodes->ps_l->u1_is_avail)
1207
0
        i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
1208
0
    if(i1_ref_bl == i1_ref_idx)
1209
0
        ps_pred_node_a = ps_pred_nodes->ps_bl;
1210
0
    else if(i1_ref_l == i1_ref_idx)
1211
0
        ps_pred_node_a = ps_pred_nodes->ps_l;
1212
0
    if(ps_pred_node_a == NULL)
1213
0
    {
1214
0
        if(i1_ref_bl != -1)
1215
0
            ps_pred_node_a = ps_pred_nodes->ps_bl;
1216
0
        else if(i1_ref_l != -1)
1217
0
            ps_pred_node_a = ps_pred_nodes->ps_l;
1218
0
    }
1219
1220
    /*************************************************************************/
1221
    /* For encoder, top left may not be really needed unless we use slices,  */
1222
    /* and even then in ME it may not be relevant. So we only consider T or  */
1223
    /* TR, as, if both T and TR are not available, TL also will not be       */
1224
    /*************************************************************************/
1225
0
    if(ps_pred_nodes->ps_tr->u1_is_avail)
1226
0
        i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
1227
0
    if(ps_pred_nodes->ps_t->u1_is_avail)
1228
0
        i1_ref_t = ps_pred_nodes->ps_t->i1_ref_idx;
1229
0
    if(ps_pred_nodes->ps_tl->u1_is_avail)
1230
0
        i1_ref_tl = ps_pred_nodes->ps_tl->i1_ref_idx;
1231
0
    if(i1_ref_tr == i1_ref_idx)
1232
0
        ps_pred_node_b = ps_pred_nodes->ps_tr;
1233
0
    else if(i1_ref_t == i1_ref_idx)
1234
0
        ps_pred_node_b = ps_pred_nodes->ps_t;
1235
0
    else if(i1_ref_tl == i1_ref_idx)
1236
0
        ps_pred_node_b = ps_pred_nodes->ps_tl;
1237
1238
0
    if(ps_pred_node_b == NULL)
1239
0
    {
1240
0
        if(i1_ref_tr != -1)
1241
0
            ps_pred_node_b = ps_pred_nodes->ps_tr;
1242
0
        else if(i1_ref_t != -1)
1243
0
            ps_pred_node_b = ps_pred_nodes->ps_t;
1244
0
        else if(i1_ref_tl != -1)
1245
0
            ps_pred_node_b = ps_pred_nodes->ps_tl;
1246
0
    }
1247
0
    if(ps_pred_node_a == NULL)
1248
0
    {
1249
0
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
1250
0
        if(ps_pred_node_b == NULL)
1251
0
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1252
0
    }
1253
0
    else if(ps_pred_node_b == NULL)
1254
0
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1255
0
    else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1256
0
    {
1257
0
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1258
0
    }
1259
1260
0
    if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
1261
0
    {
1262
0
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1263
0
    }
1264
0
    else
1265
0
    {
1266
0
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1267
0
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1268
0
    }
1269
0
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1270
0
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1271
0
    mvdx1 = ABS(mvdx1);
1272
0
    mvdy1 = ABS(mvdy1);
1273
1274
0
    if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
1275
0
    {
1276
0
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1277
0
    }
1278
0
    else
1279
0
    {
1280
0
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1281
0
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1282
0
    }
1283
0
    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
1284
0
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1285
0
    mvdx2 = ABS(mvdx2);
1286
0
    mvdy2 = ABS(mvdy2);
1287
1288
0
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1289
0
    {
1290
0
        cost = 2 * hme_get_range(mvdx1) + 2 * hme_get_range(mvdy1) + 2 * (mvdx1 > 0) +
1291
0
               2 * (mvdy1 > 0) + ref_bits + 2;
1292
0
    }
1293
0
    else
1294
0
    {
1295
0
        cost = 2 * hme_get_range(mvdx2) + 2 * hme_get_range(mvdy2) + 2 * (mvdx2 > 0) +
1296
0
               2 * (mvdy2 > 0) + ref_bits + 2;
1297
0
    }
1298
0
    {
1299
        /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
1300
0
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift);
1301
0
        S32 tot_cost = (cost * ps_pred_ctxt->lambda) << 1;
1302
1303
0
        tot_cost += (gau1_bits_for_part_id_q1[e_part_id] * ps_pred_ctxt->lambda);
1304
0
        return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift + 1));
1305
0
    }
1306
0
}
1307
1308
S32 compute_mv_cost_implicit_high_speed(
1309
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1310
4.37M
{
1311
4.37M
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
1312
4.37M
    pred_candt_nodes_t *ps_pred_nodes;
1313
4.37M
    S08 i1_ref_idx;
1314
4.37M
    S08 i1_ref_tr = -1;
1315
4.37M
    S08 i1_ref_l = -1;
1316
4.37M
    S32 inp_shift = 2 - inp_mv_pel;
1317
4.37M
    S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
1318
4.37M
    S32 ref_bits, cost;
1319
4.37M
    S32 mv_p_x, mv_p_y;
1320
4.37M
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
1321
1322
4.37M
    i1_ref_idx = ps_node->i1_ref_idx;
1323
1324
4.37M
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1325
4.37M
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
1326
1327
    /*************************************************************************/
1328
    /* Priority to bottom left availability. Else we go to left. If both are */
1329
    /* not available, then a remains null                                    */
1330
    /*************************************************************************/
1331
4.37M
    if(ps_pred_nodes->ps_l->u1_is_avail)
1332
2.58M
    {
1333
2.58M
        i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
1334
2.58M
        ps_pred_node_a = ps_pred_nodes->ps_l;
1335
2.58M
    }
1336
1337
    /*************************************************************************/
1338
    /* For encoder, top left may not be really needed unless we use slices,  */
1339
    /* and even then in ME it may not be relevant. So we only consider T or  */
1340
    /* TR, as, if both T and TR are not available, TL also will not be       */
1341
    /*************************************************************************/
1342
1343
4.37M
    if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
1344
765k
    {
1345
765k
        i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
1346
765k
        ps_pred_node_b = ps_pred_nodes->ps_tr;
1347
765k
    }
1348
3.61M
    else
1349
3.61M
    {
1350
3.61M
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1351
3.61M
    }
1352
1353
4.37M
    if(ps_pred_node_a == NULL)
1354
1.79M
    {
1355
1.79M
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
1356
1357
1.79M
        if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
1358
1.26M
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1359
1.79M
    }
1360
1361
4.37M
    if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
1362
618k
    {
1363
618k
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1364
618k
    }
1365
3.75M
    else
1366
3.75M
    {
1367
3.75M
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1368
3.75M
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1369
3.75M
    }
1370
1371
4.37M
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1372
4.37M
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1373
4.37M
    mvdx1 = ABS(mvdx1);
1374
4.37M
    mvdy1 = ABS(mvdy1);
1375
1376
4.37M
    if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
1377
957k
    {
1378
957k
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1379
957k
    }
1380
3.41M
    else
1381
3.41M
    {
1382
3.41M
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1383
3.41M
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1384
3.41M
    }
1385
1386
4.37M
    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
1387
4.37M
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1388
4.37M
    mvdx2 = ABS(mvdx2);
1389
4.37M
    mvdy2 = ABS(mvdy2);
1390
1391
4.37M
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1392
661k
    {
1393
661k
        cost =
1394
661k
            hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1395
661k
    }
1396
3.71M
    else
1397
3.71M
    {
1398
3.71M
        cost =
1399
3.71M
            hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1400
3.71M
    }
1401
4.37M
    {
1402
        /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
1403
4.37M
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1404
4.37M
        S32 tot_cost = (cost * ps_pred_ctxt->lambda);
1405
1406
4.37M
        return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift));
1407
4.37M
    }
1408
4.37M
}
1409
1410
S32 compute_mv_cost_implicit_high_speed_modified(
1411
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1412
0
{
1413
0
    search_node_t *ps_pred_node_a = NULL;
1414
0
    pred_candt_nodes_t *ps_pred_nodes;
1415
0
    S32 inp_shift = 2 - inp_mv_pel;
1416
0
    S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
1417
0
    S32 mv_p_x, mv_p_y;
1418
0
    S16 mvdx1, mvdy1;
1419
0
    S32 cost, ref_bits;
1420
1421
0
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1422
0
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1423
1424
0
    ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
1425
1426
0
    mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1427
0
    mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1428
0
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1429
0
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1430
0
    mvdx1 = ABS(mvdx1);
1431
0
    mvdy1 = ABS(mvdy1);
1432
1433
0
    cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1434
1435
0
    {
1436
0
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1437
0
        return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1438
0
    }
1439
0
}
1440
1441
/**
********************************************************************************
*  @fn     hme_update_results_grid_pu_bestn_xtreme_speed(
*              result_upd_prms_t *ps_result_prms)
*
*  @brief  Grid result update that looks only at the first entry of
*          pi4_valid_part_ids and keeps a single best result for it
*          (num_results is asserted to be 1). The mv cost is always taken
*          from compute_mv_cost_coarse_high_speed.
*          Note carried over from the original code: the function was
*          modified with the assumption that only 2NxN_B and Nx2N_R is
*          modified.
*
*  @param[in,out]  ps_result_prms : input parameters (sad grid, grid mask,
*                  step, centre node, valid part ids, search results). On
*                  return i4_min_id holds the last grid pt that improved the
*                  stored best result (PT_C if none did) and i4_min_cost is
*                  written whenever such an improvement happens.
*
*  @return  None
********************************************************************************
*/
void hme_update_results_grid_pu_bestn_xtreme_speed(result_upd_prms_t *ps_result_prms)
{
    search_results_t *ps_results = ps_result_prms->ps_search_results;
    const search_node_t *ps_centre = ps_result_prms->ps_search_node_base;
    S32 *pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
    S32 *pi4_sad = ps_result_prms->pi4_sad_grid;
    S32 grid_mask = ps_result_prms->i4_grid_mask;
    S32 step = ps_result_prms->i4_step;
    S32 num_results = (S32)ps_results->u1_num_results_per_part;
    S32 ref_idx = (S32)ps_result_prms->i1_ref_idx;
    S32 pred_lx = 1 - ps_results->pu1_is_past[ref_idx];

    /* Only the first valid part id is evaluated by this variant */
    S32 part_id = pi4_valid_part_ids[0];
    S32 unique_part_id = pi4_valid_part_ids[0];

    /* Best results stored for this reference and this part */
    search_node_t *ps_best = ps_results->aps_part_results[ref_idx][part_id];

    /* Working copy of the centre node; its mv is rewritten per grid pt */
    search_node_t s_cand = *ps_centre;

    S32 num_active_pts = 0;
    S32 best_pt = (S32)PT_C;
    S32 pt;

    /* Number of enabled pts among the 9 mask bits; used as the stride */
    /* between consecutive parts in the sad grid                        */
    for(pt = 0; pt < 9; pt++)
    {
        num_active_pts += ((grid_mask & (1 << pt)) != 0);
    }

    /*************************************************************************/
    /* Walk through every enabled pt of the grid                             */
    /*************************************************************************/
    for(pt = 0; pt < (S32)NUM_GRID_PTS; pt++)
    {
        S32 mv_cost, sad, tot_cost;

        if(!(grid_mask & (1 << pt)))
        {
            continue;
        }

        /* Candidate mv = centre mv + step * grid offset of this pt */
        s_cand.s_mv.i2_mvx = ps_centre->s_mv.i2_mvx + (S16)(step * gai1_grid_id_to_x[pt]);
        s_cand.s_mv.i2_mvy = ps_centre->s_mv.i2_mvy + (S16)(step * gai1_grid_id_to_y[pt]);

        mv_cost = compute_mv_cost_coarse_high_speed(
            &s_cand, &ps_results->as_pred_ctxt[pred_lx], (PART_ID_T)part_id, MV_RES_FPEL);

        sad = pi4_sad[num_active_pts * part_id];
        tot_cost = sad + mv_cost;

        ASSERT(unique_part_id == part_id);
        ASSERT(num_results == 1);

        /* Nothing to do unless the candidate beats the stored result */
        if(tot_cost < ps_best[num_results - 1].i4_tot_cost)
        {
            best_pt = pt;
            ps_result_prms->i4_min_cost = tot_cost;

            ps_best[0] = s_cand;
            ps_best[0].i4_sad = sad;
            ps_best[0].i4_mv_cost = mv_cost;
            ps_best[0].i4_tot_cost = tot_cost;
        }

        /* The sad pointer advances only for enabled pts */
        pi4_sad++;
    }

    ps_result_prms->i4_min_id = best_pt;
}
1555
1556
/**
********************************************************************************
*  @fn     hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
*
*  @brief  For every enabled pt of the grid and every valid part id, computes
*          sad + mv cost and inserts the candidate into the cost sorted list
*          of best results kept for that part.
*
*  @param[in,out]  ps_result_prms : input parameters (sad grid, grid mask,
*                  step, centre node, valid part ids terminated by a negative
*                  entry, mv cost function pointer, search results). For the
*                  first valid part id, i4_min_cost is lowered whenever a grid
*                  pt costs less than the value already stored there and
*                  i4_min_id records that pt (PT_C if none did).
*
*  @return  None
********************************************************************************
*/
void hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
{
    search_results_t *ps_results = ps_result_prms->ps_search_results;
    const search_node_t *ps_centre = ps_result_prms->ps_search_node_base;
    S32 *pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
    S32 *pi4_sad = ps_result_prms->pi4_sad_grid;
    S32 grid_mask = ps_result_prms->i4_grid_mask;
    S32 step = ps_result_prms->i4_step;
    S32 num_results = (S32)ps_results->u1_num_results_per_part;
    S32 ref_idx = (S32)ps_result_prms->i1_ref_idx;
    S32 pred_lx = 1 - ps_results->pu1_is_past[ref_idx];
    S32 unique_part_id = pi4_valid_part_ids[0];

    /* Working copy of the centre node; its mv is rewritten per grid pt */
    search_node_t s_cand = *ps_centre;

    S32 num_active_pts = 0;
    S32 best_pt = (S32)PT_C;
    S32 pt;

    /* Number of enabled pts among the 9 mask bits; used as the stride */
    /* between consecutive parts in the sad grid                        */
    for(pt = 0; pt < 9; pt++)
    {
        num_active_pts += ((grid_mask & (1 << pt)) != 0);
    }

    /*************************************************************************/
    /* Walk through every enabled pt of the grid                             */
    /*************************************************************************/
    for(pt = 0; pt < (S32)NUM_GRID_PTS; pt++)
    {
        S32 part_idx, part_id;

        if(!(grid_mask & (1 << pt)))
        {
            continue;
        }

        /* Candidate mv = centre mv + step * grid offset of this pt */
        s_cand.s_mv.i2_mvx = ps_centre->s_mv.i2_mvx + (S16)(step * gai1_grid_id_to_x[pt]);
        s_cand.s_mv.i2_mvy = ps_centre->s_mv.i2_mvy + (S16)(step * gai1_grid_id_to_y[pt]);

        /* The part id list is terminated by a negative entry */
        for(part_idx = 0; (part_id = pi4_valid_part_ids[part_idx]) >= 0; part_idx++)
        {
            /* Best results stored for this reference and this part */
            search_node_t *ps_best = ps_results->aps_part_results[ref_idx][part_id];
            S32 mv_cost, sad, tot_cost, slot;

            mv_cost = ps_result_prms->pf_mv_cost_compute(
                &s_cand, &ps_results->as_pred_ctxt[pred_lx], (PART_ID_T)part_id, MV_RES_FPEL);

            sad = pi4_sad[num_active_pts * part_id];
            tot_cost = sad + mv_cost;

            /* Grid minimum is tracked for the first valid part only */
            if((unique_part_id == part_id) && (tot_cost < ps_result_prms->i4_min_cost))
            {
                best_pt = pt;
                ps_result_prms->i4_min_cost = tot_cost;
            }

            /* Skip candidates that do not beat the last stored result */
            if(tot_cost >= ps_best[num_results - 1].i4_tot_cost)
            {
                continue;
            }

            /* Find the slot for the candidate. A strictly cheaper candidate */
            /* pushes the tail down by one; a candidate with equal cost for  */
            /* which hme_cmp_nodes returns 0 overwrites that slot in place   */
            slot = 0;
            while(slot < num_results - 1)
            {
                S32 slot_cost = ps_best[slot].i4_tot_cost;

                if(tot_cost < slot_cost)
                {
                    memmove(
                        ps_best + slot + 1,
                        ps_best + slot,
                        sizeof(search_node_t) * (num_results - 1 - slot));
                    break;
                }
                if((tot_cost == slot_cost) && (0 == hme_cmp_nodes(&s_cand, ps_best + slot)))
                {
                    break;
                }
                slot++;
            }

            ps_best[slot] = s_cand;
            ps_best[slot].i4_sad = sad;
            ps_best[slot].i4_mv_cost = mv_cost;
            ps_best[slot].i4_tot_cost = tot_cost;
        }

        /* The sad pointer advances only for enabled pts */
        pi4_sad++;
    }

    ps_result_prms->i4_min_id = best_pt;
}
1670
1671
/**
1672
********************************************************************************
1673
*  @fn     hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1674
*
1675
*  @brief  Updates results for the case where 1 best result is to be updated
1676
*          for a given pt, for several parts
1677
*          Note : The function is replicated for CLIPing the cost to 16bit to make
1678
*                  bit match with SIMD version
1679
*
1680
*  @param[in]  result_upd_prms_t : Contains the input parameters to this fxn
1681
*
1682
*  @return   The result_upd_prms_t structure is updated for all the active
1683
*            parts in case the current candt has results for any given part
1684
*             that is the best result for that part
1685
********************************************************************************
1686
*/
1687
void hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1688
39.7M
{
1689
39.7M
    search_node_t s_search_node_grid;
1690
39.7M
    const search_node_t *ps_search_node_base;
1691
39.7M
    search_node_t *ps_search_node_grid, *ps_best_node;
1692
39.7M
    S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
1693
39.7M
    S32 num_results, i4_unique_id = -1, i4_grid_pt;
1694
39.7M
    search_results_t *ps_search_results;
1695
39.7M
    S32 *pi4_valid_part_ids;
1696
39.7M
    S32 i4_step = ps_result_prms->i4_step;
1697
39.7M
    S32 i4_grid_mask, i4_count, i, i4_min_id;
1698
39.7M
    S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
1699
39.7M
    S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
1700
39.7M
    S32 grid_count = 0;
1701
39.7M
    S32 pred_lx;
1702
1703
39.7M
    i4_min_id = (S32)PT_C;
1704
39.7M
    i4_min_cost = MAX_32BIT_VAL;
1705
39.7M
    ps_search_node_grid = &s_search_node_grid;
1706
39.7M
    ps_search_node_base = ps_result_prms->ps_search_node_base;
1707
39.7M
    *ps_search_node_grid = *ps_search_node_base;
1708
39.7M
    pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1709
39.7M
    ps_search_results = ps_result_prms->ps_search_results;
1710
39.7M
    num_results = (S32)ps_search_results->u1_num_results_per_part;
1711
39.7M
    i4_grid_mask = ps_result_prms->i4_grid_mask;
1712
1713
397M
    for(i = 0; i < 9; i++)
1714
357M
    {
1715
357M
        if(i4_grid_mask & (1 << i))
1716
39.7M
            grid_count++;
1717
357M
    }
1718
1719
    /* Some basic assumptions: only single pt, only part updates */
1720
    /* and more than 1 best result to be computed.               */
1721
1722
39.7M
    i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
1723
39.7M
    pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
1724
1725
    /*************************************************************************/
1726
    /* Supposing we do hte result update for a unique partid, we can */
1727
    /* store the best pt id in the grid and also min cost is return */
1728
    /* param. This will be useful for early exit cases.             */
1729
    /* TODO : once we have separate fxn for unique part+grid, we can */
1730
    /* do away with this code here                                   */
1731
    /*************************************************************************/
1732
    //if (pi4_valid_part_ids[1] == -1)
1733
39.7M
    i4_unique_id = pi4_valid_part_ids[0];
1734
1735
    /*************************************************************************/
1736
    /* Outer loop runs through all active pts in the grid                    */
1737
    /*************************************************************************/
1738
397M
    for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
1739
357M
    {
1740
357M
        if(!(i4_grid_mask & (1 << i4_grid_pt)))
1741
317M
            continue;
1742
1743
        /* For the pt in the grid, update mvx and y depending on */
1744
        /* location of pt. Updates are in FPEL units.            */
1745
39.7M
        ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
1746
39.7M
        ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
1747
39.7M
        ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
1748
39.7M
        ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
1749
1750
39.7M
        i4_count = 0;
1751
1752
        /* pi4_valid_part_ids contains all the valid ids. We loop through */
1753
        /* this till we encounter -1. This is easier than having to       */
1754
        /* figure out part by part, besides, active part decision is      */
1755
        /* usually fixed for a given duration of search, e.g. entire fpel */
1756
        /* refinement for a blk/cu will use fixed valid part mask         */
1757
1758
172M
        while((id = pi4_valid_part_ids[i4_count]) >= 0)
1759
132M
        {
1760
            //ps_search_node_grid->e_part_type = (PART_TYPE_T)id;
1761
1762
            /*****************************************************************/
1763
            /* points to the best search results corresponding to this       */
1764
            /* specific part type.                                           */
1765
            /*****************************************************************/
1766
132M
            ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1767
1768
            /* evaluate mv cost and totalcost for this part for this given mv*/
1769
132M
            i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
1770
132M
                ps_search_node_grid,
1771
132M
                &ps_search_results->as_pred_ctxt[pred_lx],
1772
132M
                (PART_ID_T)id,
1773
132M
                MV_RES_FPEL);
1774
1775
132M
            i4_sad = pi4_sad_grid[grid_count * id];
1776
1777
            /* Clipping to 16 bit to bit match with SIMD version */
1778
132M
            i4_mv_cost = CLIP_S16(i4_mv_cost);
1779
132M
            i4_sad = CLIP_S16(i4_sad);
1780
1781
132M
            i4_tot_cost = i4_sad + i4_mv_cost;
1782
            /* Clipping to 16 bit to bit match with SIMD version */
1783
132M
            i4_tot_cost = CLIP_S16(i4_tot_cost);
1784
1785
132M
            if(i4_unique_id == id)
1786
39.7M
            {
1787
39.7M
                if(i4_tot_cost < ps_result_prms->i4_min_cost)
1788
39.7M
                {
1789
39.7M
                    i4_min_id = i4_grid_pt;
1790
39.7M
                    ps_result_prms->i4_min_cost = i4_tot_cost;
1791
39.7M
                }
1792
39.7M
            }
1793
1794
            /*****************************************************************/
1795
            /* We do not labor through the results if the total cost worse   */
1796
            /* than the last of the results.                                 */
1797
            /*****************************************************************/
1798
132M
            if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1799
36.8M
            {
1800
                /*************************************************************/
1801
                /* Identify where the current result isto be placed.Basically*/
1802
                /* find the node which has cost just higher thannodeundertest*/
1803
                /*************************************************************/
1804
38.1M
                for(i = 0; i < num_results - 1; i++)
1805
13.7M
                {
1806
13.7M
                    if(i4_tot_cost <= ps_best_node[i].i4_tot_cost)
1807
12.4M
                    {
1808
12.4M
                        memmove(
1809
12.4M
                            ps_best_node + i + 1,
1810
12.4M
                            ps_best_node + i,
1811
12.4M
                            sizeof(search_node_t) * (num_results - 1 - i));
1812
12.4M
                        break;
1813
12.4M
                    }
1814
13.7M
                }
1815
36.8M
                ps_best_node[i] = *ps_search_node_grid;
1816
36.8M
                ps_best_node[i].i4_sad = i4_sad;
1817
36.8M
                ps_best_node[i].i4_mv_cost = i4_mv_cost;
1818
36.8M
                ps_best_node[i].i4_tot_cost = i4_tot_cost;
1819
36.8M
            }
1820
132M
            i4_count++;
1821
132M
        }
1822
39.7M
        pi4_sad_grid++;
1823
39.7M
    }
1824
39.7M
    ps_result_prms->i4_min_id = i4_min_id;
1825
39.7M
}
1826
1827
/**
1828
********************************************************************************
1829
*  @fn     hme_update_results_pt_npu_best1(result_upd_prms_t *ps_result_prms)
1830
*
1831
*  @brief  Updates results for the case where 1 best result is to be updated
1832
*          for a given pt, for several parts
1833
*
1834
*  @param[in]  ps_result_prms. Contains the input parameters to this fxn
1835
*              ::ps_pred_info : contains cost fxn ptr and predictor info
1836
*              ::pi4_sad : 17x9 SAD Grid, this case, only 1st 17 entries valid
1837
*              ::ps_search_results: Search results structure
1838
*              ::i1_ref_id : Reference index
1839
*              ::i4_grid_mask: Dont Care for this fxn
1840
*              ::pi4_valid_part_ids : valid part ids
1841
*              ::ps_search_node_base: Contains the centre pt candt info.
1842
*
1843
*  @return   The ps_search_results structure is updated for all the active
1844
*            parts in case the current candt has results for any given part
1845
*             that is the best result for that part
1846
********************************************************************************
1847
*/
1848
1849
void hme_update_results_pt_pu_best1_subpel_hs(
1850
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1851
911k
{
1852
911k
    search_node_t *ps_search_node_base, *ps_best_node;
1853
911k
    search_results_t *ps_search_results;
1854
911k
    S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1855
911k
    S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1856
911k
    S32 num_results, i;
1857
911k
    S32 *pi4_valid_part_ids;
1858
1859
911k
    pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1860
    /* Some basic assumptions: only single pt, only part updates */
1861
    /* and more than 1 best result to be computed.               */
1862
911k
    ASSERT(ps_result_prms->i4_grid_mask == 1);
1863
1864
911k
    ps_search_results = ps_result_prms->ps_search_results;
1865
911k
    num_results = (S32)ps_search_results->u1_num_results_per_part;
1866
1867
    /* Compute mv cost, total cost */
1868
911k
    ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1869
1870
8.56M
    while((id = pi4_valid_part_ids[i4_count]) >= 0)
1871
7.65M
    {
1872
7.65M
        S32 update_required = 1;
1873
1874
7.65M
        ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1875
        /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1876
7.65M
        i4_mv_cost = ps_best_node->i4_mv_cost;
1877
7.65M
        i4_sad = ps_result_prms->pi4_sad_grid[id];
1878
7.65M
        i4_tot_cost = i4_sad + i4_mv_cost;
1879
1880
        /* We do not labor through the results if the total cost is worse than   */
1881
        /* the last of the results.                                              */
1882
7.65M
        if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1883
4.81M
        {
1884
            /* Identify where the current result is to be placed. Basically find  */
1885
            /* the node which has cost just higher than node under test           */
1886
4.81M
            for(i = 0; i < num_results - 1; i++)
1887
0
            {
1888
0
                if(ps_best_node[i].i1_ref_idx != -1)
1889
0
                {
1890
0
                    if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
1891
0
                    {
1892
0
                        memmove(
1893
0
                            ps_best_node + i + 1,
1894
0
                            ps_best_node + i,
1895
0
                            sizeof(search_node_t) * (num_results - 1 - i));
1896
0
                        break;
1897
0
                    }
1898
0
                    else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
1899
0
                    {
1900
0
                        update_required = 0;
1901
0
                        break;
1902
0
                    }
1903
0
                }
1904
0
                else
1905
0
                {
1906
0
                    break;
1907
0
                }
1908
0
            }
1909
1910
4.81M
            if(update_required)
1911
4.81M
            {
1912
                /* Update when either ref_idx or mv's are different */
1913
4.81M
                ps_best_node[i] = *ps_search_node_base;
1914
4.81M
                ps_best_node[i].i4_sad = i4_sad;
1915
4.81M
                ps_best_node[i].i4_mv_cost = i4_mv_cost;
1916
4.81M
                ps_best_node[i].i4_tot_cost = i4_tot_cost;
1917
4.81M
            }
1918
4.81M
        }
1919
7.65M
        i4_count++;
1920
7.65M
    }
1921
911k
}
1922
1923
void hme_update_results_pt_pu_best1_subpel_hs_1(
1924
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1925
0
{
1926
0
    search_node_t *ps_search_node_base, *ps_best_node;
1927
0
    search_results_t *ps_search_results;
1928
0
    S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1929
0
    S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1930
0
    S32 num_results;
1931
0
    S32 *pi4_valid_part_ids;
1932
1933
0
    pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1934
    /* Some basic assumptions: only single pt, only part updates */
1935
    /* and more than 1 best result to be computed.               */
1936
0
    ASSERT(ps_result_prms->i4_grid_mask == 1);
1937
1938
0
    ps_search_results = ps_result_prms->ps_search_results;
1939
0
    num_results = (S32)ps_search_results->u1_num_results_per_part;
1940
1941
    /* Compute mv cost, total cost */
1942
0
    ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1943
1944
0
    while((id = pi4_valid_part_ids[i4_count]) >= 0)
1945
0
    {
1946
0
        S32 update_required = 0;
1947
1948
0
        ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1949
        /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1950
0
        i4_mv_cost = ps_best_node->i4_mv_cost;
1951
0
        i4_sad = ps_result_prms->pi4_sad_grid[id];
1952
0
        i4_tot_cost = i4_sad + i4_mv_cost;
1953
1954
        /* We do not labor through the results if the total cost is worse than   */
1955
        /* the last of the results.                                              */
1956
0
        if(i4_tot_cost < ps_best_node[1].i4_tot_cost)
1957
0
        {
1958
0
            S32 sdi_value = 0;
1959
1960
0
            update_required = 2;
1961
            /* Identify where the current result is to be placed. Basically find  */
1962
            /* the node which has cost just higher than node under test           */
1963
0
            {
1964
0
                if(i4_tot_cost < ps_best_node[0].i4_tot_cost)
1965
0
                {
1966
0
                    update_required = 1;
1967
0
                    sdi_value = ps_best_node[0].i4_sad - i4_sad;
1968
0
                }
1969
0
                else if(
1970
0
                    (ps_result_prms->i2_mv_x == ps_best_node[0].s_mv.i2_mvx) &&
1971
0
                    (ps_result_prms->i2_mv_y == ps_best_node[0].s_mv.i2_mvy) &&
1972
0
                    (ps_best_node[0].i1_ref_idx == ps_result_prms->i1_ref_idx))
1973
0
                {
1974
0
                    update_required = 0;
1975
0
                }
1976
0
            }
1977
0
            if(update_required == 2)
1978
0
            {
1979
0
                subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1980
1981
0
                ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] = i4_tot_cost;
1982
0
                ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] = i4_mv_cost;
1983
0
                ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] = ps_result_prms->i2_mv_x;
1984
0
                ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] = ps_result_prms->i2_mv_y;
1985
0
                ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] = ps_result_prms->i1_ref_idx;
1986
0
            }
1987
0
            else if(update_required == 1)
1988
0
            {
1989
0
                subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1990
1991
0
                ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] =
1992
0
                    ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count];
1993
0
                ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] =
1994
0
                    ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count];
1995
0
                ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] =
1996
0
                    ps_subpel_refine_ctxt->i2_mv_x[0][i4_count];
1997
0
                ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] =
1998
0
                    ps_subpel_refine_ctxt->i2_mv_y[0][i4_count];
1999
0
                ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] =
2000
0
                    ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count];
2001
2002
0
                ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] = i4_tot_cost;
2003
0
                ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count] = i4_mv_cost;
2004
0
                ps_subpel_refine_ctxt->i2_mv_x[0][i4_count] = ps_result_prms->i2_mv_x;
2005
0
                ps_subpel_refine_ctxt->i2_mv_y[0][i4_count] = ps_result_prms->i2_mv_y;
2006
0
                ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count] = ps_result_prms->i1_ref_idx;
2007
0
            }
2008
0
        }
2009
0
        i4_count++;
2010
0
    }
2011
0
}
2012
2013
/**
2014
******************************************************************************
2015
*  @brief Gives a result fxn ptr for a index [x] where x is as:
2016
*         0 : single pt, no partial updates, 1 best result
2017
*         1 : single pt, no partial updates, N best results
2018
*         2 : single pt,    partial updates, 1 best result
2019
*         3 : single pt,    partial updates, N best results
2020
*         0 : grid     , no partial updates, 1 best result
2021
*         1 : grid     , no partial updates, N best results
2022
*         2 : grid     ,    partial updates, 1 best result
2023
*         3 : grid     ,    partial updates, N best results
2024
******************************************************************************
2025
*/
2026
2027
static PF_RESULT_FXN_T g_pf_result_fxn[8] = { UPD_RES_PT_NPU_BEST1,   UPD_RES_PT_NPU_BESTN,
2028
                                              UPD_RES_PT_PU_BEST1,    UPD_RES_PT_PU_BESTN,
2029
                                              UPD_RES_GRID_NPU_BEST1, UPD_RES_GRID_NPU_BESTN,
2030
                                              UPD_RES_GRID_PU_BEST1,  UPD_RES_GRID_PU_BESTN };
2031
2032
/**
2033
********************************************************************************
2034
*  @fn     hme_get_result_fxn(i4_grid_mask, i4_part_mask, i4_num_results)
2035
*
2036
*  @brief  Obtains the suitable result function that evaluates COST and also
2037
*           computes one or more best results for point/grid, single part or
2038
*           more than one part.
2039
*
2040
*  @param[in]  i4_grid_mask : Mask containing which of 9 grid pts active
2041
*
2042
*  @param[in]  i4_part_mask : Mask containing which of the 17 parts active
2043
*
2044
*  @param[in]  i4_num_results: Number of active results
2045
*
2046
*  @return   Pointer to the appropriate result update function
2047
********************************************************************************
2048
*/
2049
PF_RESULT_FXN_T hme_get_result_fxn(S32 i4_grid_mask, S32 i4_part_mask, S32 i4_num_results)
2050
16.8M
{
2051
16.8M
    S32 i4_is_grid = (i4_grid_mask != 1);
2052
16.8M
    S32 i4_is_pu = ((i4_part_mask & (i4_part_mask - 1)) != 0);
2053
16.8M
    S32 i4_res_gt1 = (i4_num_results > 1);
2054
16.8M
    S32 id;
2055
2056
16.8M
    id = (i4_is_grid << 2) + (i4_is_pu << 1) + i4_res_gt1;
2057
2058
16.8M
    return (g_pf_result_fxn[id]);
2059
16.8M
}
2060
2061
void hme_calc_sad_and_2_best_results(
2062
    hme_search_prms_t *ps_search_prms,
2063
    wgt_pred_ctxt_t *ps_wt_inp_prms,
2064
    err_prms_t *ps_err_prms,
2065
    result_upd_prms_t *ps_result_prms,
2066
    U08 **ppu1_ref,
2067
    S32 i4_ref_stride)
2068
0
{
2069
0
    S32 i4_candt;
2070
0
    S32 i4_inp_off;
2071
0
    S32 i4_ref_offset;
2072
0
    S32 i4_num_nodes;
2073
2074
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2075
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2076
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2077
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2078
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2079
2080
0
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
2081
0
    search_node_t *ps_search_node;
2082
2083
0
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2084
0
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2085
0
    i4_inp_off = ps_search_prms->i4_cu_x_off;
2086
0
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2087
0
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2088
0
    ps_search_node = ps_search_prms->ps_search_nodes;
2089
2090
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2091
0
    {
2092
        /**********************************************************************/
2093
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
2094
        /**********************************************************************/
2095
0
        {
2096
0
            WORD32 b, c, d;
2097
0
            UWORD8 *pu1_cur_ptr;
2098
0
            UWORD8 *pu1_ref_ptr;
2099
0
            UWORD16 au2_4x4_sad[NUM_4X4];
2100
2101
0
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2102
0
            {
2103
0
                continue;
2104
0
            }
2105
2106
0
            ps_err_prms->pu1_inp =
2107
0
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2108
0
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2109
0
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2110
0
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2111
2112
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2113
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2114
2115
            /* Loop to compute the SAD's */
2116
0
            {
2117
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2118
0
                for(b = 0; b < NUM_4X4; b++)
2119
0
                {
2120
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2121
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2122
2123
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2124
0
                    {
2125
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2126
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2127
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2128
0
                        {
2129
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
2130
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2131
0
                        }
2132
0
                    }
2133
0
                }
2134
2135
0
                pi4_sad_grid[PART_ID_NxN_TL] =
2136
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2137
0
                pi4_sad_grid[PART_ID_NxN_TR] =
2138
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2139
0
                pi4_sad_grid[PART_ID_NxN_BL] =
2140
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2141
0
                pi4_sad_grid[PART_ID_NxN_BR] =
2142
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2143
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
2144
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2145
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
2146
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2147
0
                pi4_sad_grid[PART_ID_2NxN_T] =
2148
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2149
0
                pi4_sad_grid[PART_ID_2NxN_B] =
2150
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2151
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
2152
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2153
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
2154
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2155
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
2156
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2157
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
2158
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2159
0
                pi4_sad_grid[PART_ID_2Nx2N] =
2160
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2161
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
2162
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2163
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
2164
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2165
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
2166
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2167
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
2168
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2169
0
            }
2170
0
        }
2171
2172
0
        {
2173
0
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2174
0
            S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2175
0
            S32 best_node_cost;
2176
0
            S32 second_best_node_cost;
2177
2178
0
            {
2179
0
                S16 mvdx1, mvdy1;
2180
0
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2181
0
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2182
0
                S32 pred_lx = i4_search_idx;
2183
2184
0
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2185
0
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2186
0
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2187
2188
0
                S32 inp_shift = 2;
2189
0
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2190
0
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2191
0
                S32 lambda = ps_pred_ctxt->lambda;
2192
0
                S32 rnd = 1 << (lambda_q_shift - 1);
2193
0
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2194
0
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2195
0
                S32 ref_bits =
2196
0
                    ps_pred_ctxt
2197
0
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2198
2199
0
                COMPUTE_DIFF_MV(
2200
0
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2201
2202
0
                mvdx1 = ABS(mvdx1);
2203
0
                mvdy1 = ABS(mvdy1);
2204
2205
0
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2206
0
                             (mvdy1 > 0) + ref_bits + 2;
2207
2208
0
                i4_mv_cost *= lambda;
2209
0
                i4_mv_cost += rnd;
2210
0
                i4_mv_cost >>= lambda_q_shift;
2211
2212
0
                i4_mv_cost = CLIP_U16(i4_mv_cost);
2213
0
            }
2214
2215
            /*For each valid partition, update the refine_prm structure to reflect the best and second
2216
            best candidates for that partition*/
2217
2218
0
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2219
0
            {
2220
0
                S32 update_required = 0;
2221
0
                S32 part_id = pi4_valid_part_ids[i4_count];
2222
0
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2223
2224
                /*Calculate total cost*/
2225
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2226
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2227
2228
                /*****************************************************************/
2229
                /* We do not labor through the results if the total cost worse   */
2230
                /* than the last of the results.                                 */
2231
                /*****************************************************************/
2232
0
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
2233
0
                second_best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[1][index]);
2234
2235
0
                if(i4_tot_cost < second_best_node_cost)
2236
0
                {
2237
0
                    update_required = 2;
2238
2239
                    /*************************************************************/
2240
                    /* Identify where the current result isto be placed.Basically*/
2241
                    /* find the node which has cost just higher thannodeundertest*/
2242
                    /*************************************************************/
2243
0
                    if(i4_tot_cost < best_node_cost)
2244
0
                    {
2245
0
                        update_required = 1;
2246
0
                    }
2247
0
                    else if(i4_tot_cost == best_node_cost)
2248
0
                    {
2249
0
                        update_required = 0;
2250
0
                    }
2251
2252
0
                    if(update_required == 2)
2253
0
                    {
2254
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2255
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2256
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2257
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2258
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2259
0
                    }
2260
0
                    else if(update_required == 1)
2261
0
                    {
2262
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2263
0
                            ps_mv_refine_ctxt->i2_tot_cost[0][index];
2264
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2265
0
                            ps_mv_refine_ctxt->i2_mv_cost[0][index];
2266
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2267
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2268
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2269
0
                            ps_mv_refine_ctxt->i2_ref_idx[0][index];
2270
2271
0
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2272
0
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2273
0
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2274
0
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2275
0
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2276
0
                    }
2277
0
                }
2278
0
            }
2279
0
        }
2280
0
        ps_search_node++;
2281
0
    }
2282
2283
0
    {
2284
0
        WORD32 i4_i;
2285
0
        WORD32 part_id;
2286
0
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2287
0
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2288
0
        {
2289
0
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2290
0
            if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2291
0
            {
2292
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2293
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2294
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2295
2296
0
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2297
0
            }
2298
0
            if(ps_mv_refine_ctxt->i2_tot_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2299
0
            {
2300
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2301
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2302
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2303
2304
0
                ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2305
0
            }
2306
0
        }
2307
0
    }
2308
0
}
2309
2310
void hme_calc_sad_and_2_best_results_subpel(
2311
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
2312
0
{
2313
0
    S32 i4_candt;
2314
0
    S32 i4_num_nodes;
2315
2316
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2317
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2318
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2319
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2320
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2321
2322
0
    mv_refine_ctxt_t *ps_subpel_refine_ctxt;
2323
0
    ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
2324
0
    i4_num_nodes = 1;
2325
2326
    /* Run through each of the candts in a loop */
2327
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2328
0
    {
2329
        /**********************************************************************/
2330
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
2331
        /**********************************************************************/
2332
0
        {
2333
0
            WORD32 b, c, d;
2334
0
            UWORD8 *pu1_cur_ptr;
2335
0
            UWORD8 *pu1_ref_ptr;
2336
0
            UWORD16 au2_4x4_sad[NUM_4X4];
2337
2338
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2339
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2340
2341
            /* Loop to compute the SAD's */
2342
0
            {
2343
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2344
0
                for(b = 0; b < NUM_4X4; b++)
2345
0
                {
2346
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2347
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2348
2349
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2350
0
                    {
2351
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2352
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2353
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2354
0
                        {
2355
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
2356
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2357
0
                        }
2358
0
                    }
2359
0
                }
2360
2361
0
                pi4_sad_grid[PART_ID_NxN_TL] =
2362
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2363
0
                pi4_sad_grid[PART_ID_NxN_TR] =
2364
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2365
0
                pi4_sad_grid[PART_ID_NxN_BL] =
2366
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2367
0
                pi4_sad_grid[PART_ID_NxN_BR] =
2368
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2369
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
2370
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2371
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
2372
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2373
0
                pi4_sad_grid[PART_ID_2NxN_T] =
2374
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2375
0
                pi4_sad_grid[PART_ID_2NxN_B] =
2376
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2377
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
2378
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2379
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
2380
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2381
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
2382
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2383
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
2384
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2385
0
                pi4_sad_grid[PART_ID_2Nx2N] =
2386
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2387
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
2388
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2389
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
2390
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2391
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
2392
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2393
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
2394
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2395
0
            }
2396
0
        }
2397
        /**********************************************************************/
2398
        /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
2399
        /**********************************************************************/
2400
0
        {
2401
0
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2402
0
            S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
2403
0
            S32 best_node_cost;
2404
0
            S32 second_best_node_cost;
2405
2406
            /*For each valid partition, update the refine_prm structure to reflect the best and second
2407
            best candidates for that partition*/
2408
2409
0
            for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
2410
0
            {
2411
0
                S32 update_required = 0;
2412
0
                S32 part_id = pi4_valid_part_ids[i4_count];
2413
0
                S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2414
2415
                /* Use a pre-computed cost instead of freshly evaluating subpel cost */
2416
0
                i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
2417
2418
                /*Calculate total cost*/
2419
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2420
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2421
2422
                /*****************************************************************/
2423
                /* We do not labor through the results if the total cost worse   */
2424
                /* than the last of the results.                                 */
2425
                /*****************************************************************/
2426
0
                best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
2427
0
                second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
2428
2429
0
                if(i4_tot_cost < second_best_node_cost)
2430
0
                {
2431
0
                    update_required = 2;
2432
2433
                    /*************************************************************/
2434
                    /* Identify where the current result isto be placed.Basically*/
2435
                    /* find the node which has cost just higher thannodeundertest*/
2436
                    /*************************************************************/
2437
0
                    if(i4_tot_cost < best_node_cost)
2438
0
                    {
2439
0
                        update_required = 1;
2440
0
                    }
2441
0
                    else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
2442
0
                    {
2443
0
                        update_required = 0;
2444
0
                    }
2445
0
                    if(update_required == 2)
2446
0
                    {
2447
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2448
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2449
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
2450
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
2451
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
2452
0
                    }
2453
0
                    else if(update_required == 1)
2454
0
                    {
2455
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
2456
0
                            ps_subpel_refine_ctxt->i2_tot_cost[0][index];
2457
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
2458
0
                            ps_subpel_refine_ctxt->i2_mv_cost[0][index];
2459
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index] =
2460
0
                            ps_subpel_refine_ctxt->i2_mv_x[0][index];
2461
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index] =
2462
0
                            ps_subpel_refine_ctxt->i2_mv_y[0][index];
2463
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
2464
0
                            ps_subpel_refine_ctxt->i2_ref_idx[0][index];
2465
2466
0
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2467
0
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2468
0
                        ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
2469
0
                        ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
2470
0
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
2471
0
                    }
2472
0
                }
2473
0
            }
2474
0
        }
2475
0
    }
2476
2477
0
    {
2478
0
        WORD32 i4_count = 0;
2479
0
        for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
2480
0
        {
2481
0
            WORD32 j;
2482
0
            for(j = 0; j < 2; j++)
2483
0
            {
2484
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[j][i4_count] >= MAX_SIGNED_16BIT_VAL)
2485
0
                {
2486
0
                    ps_subpel_refine_ctxt->ai2_fullpel_satd[j][i4_count] = MAX_SIGNED_16BIT_VAL;
2487
0
                }
2488
0
            }
2489
0
        }
2490
0
    }
2491
0
}
2492
2493
void hme_calc_stim_injected_sad_and_2_best_results(
2494
    hme_search_prms_t *ps_search_prms,
2495
    wgt_pred_ctxt_t *ps_wt_inp_prms,
2496
    err_prms_t *ps_err_prms,
2497
    result_upd_prms_t *ps_result_prms,
2498
    U08 **ppu1_ref,
2499
    S32 i4_ref_stride)
2500
0
{
2501
0
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
2502
0
    search_node_t *ps_search_node;
2503
2504
0
    S32 i4_candt;
2505
0
    S32 i4_count;
2506
0
    S32 i4_inp_off;
2507
0
    S32 i4_ref_offset;
2508
0
    S32 i4_num_nodes;
2509
0
    ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
2510
0
        au8_final_ref_sigmaXSquared[17];
2511
0
    UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
2512
0
    S32 *pi4_valid_part_ids;
2513
2514
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2515
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2516
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2517
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2518
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2519
2520
0
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2521
0
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2522
0
    i4_inp_off = ps_search_prms->i4_cu_x_off;
2523
0
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2524
0
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2525
0
    ps_search_node = ps_search_prms->ps_search_nodes;
2526
0
    pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2527
2528
    /* Set local pointer to point to partition level sigma values calculated in hme_refine */
2529
0
    au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
2530
0
    au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
2531
2532
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2533
0
    {
2534
0
        {
2535
0
            WORD32 b, c, d;
2536
0
            UWORD8 *pu1_cur_ptr;
2537
0
            UWORD8 *pu1_ref_ptr;
2538
0
            UWORD16 au2_4x4_sad[NUM_4X4];
2539
2540
0
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2541
0
            {
2542
0
                continue;
2543
0
            }
2544
2545
0
            ps_err_prms->pu1_inp =
2546
0
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2547
0
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2548
0
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2549
0
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2550
2551
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2552
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2553
2554
            /* Loop to compute the SAD's */
2555
0
            {
2556
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2557
0
                for(b = 0; b < NUM_4X4; b++)
2558
0
                {
2559
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2560
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2561
2562
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2563
0
                    {
2564
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2565
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2566
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2567
0
                        {
2568
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
2569
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2570
0
                        }
2571
0
                    }
2572
0
                }
2573
2574
                /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
2575
0
                hme_compute_sigmaX_and_sigmaXSquared(
2576
0
                    pu1_ref_ptr,
2577
0
                    ref_buf_stride,
2578
0
                    au4_4x4_ref_sigmaX,
2579
0
                    au4_4x4_ref_sigmaXSquared,
2580
0
                    4,
2581
0
                    4,
2582
0
                    16,
2583
0
                    16,
2584
0
                    1,
2585
0
                    4);
2586
2587
0
                pi4_sad_grid[PART_ID_NxN_TL] =
2588
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2589
0
                pi4_sad_grid[PART_ID_NxN_TR] =
2590
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2591
0
                pi4_sad_grid[PART_ID_NxN_BL] =
2592
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2593
0
                pi4_sad_grid[PART_ID_NxN_BR] =
2594
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2595
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
2596
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2597
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
2598
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2599
0
                pi4_sad_grid[PART_ID_2NxN_T] =
2600
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2601
0
                pi4_sad_grid[PART_ID_2NxN_B] =
2602
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2603
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
2604
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2605
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
2606
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2607
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
2608
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2609
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
2610
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2611
0
                pi4_sad_grid[PART_ID_2Nx2N] =
2612
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2613
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
2614
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2615
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
2616
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2617
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
2618
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2619
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
2620
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2621
0
            }
2622
0
        }
2623
2624
0
        {
2625
0
            S32 i4_sad, i4_mv_cost, i4_tot_cost;
2626
0
            S32 best_node_cost;
2627
0
            S32 second_best_node_cost;
2628
0
            ULWORD64 u8_temp_var, u8_temp_var1;
2629
0
            ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
2630
2631
0
            {
2632
0
                S16 mvdx1, mvdy1;
2633
0
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2634
0
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2635
0
                S32 pred_lx = i4_search_idx;
2636
2637
0
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2638
0
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2639
0
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2640
2641
0
                S32 inp_shift = 2;
2642
0
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2643
0
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2644
0
                S32 lambda = ps_pred_ctxt->lambda;
2645
0
                S32 rnd = 1 << (lambda_q_shift - 1);
2646
0
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2647
0
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2648
0
                S32 ref_bits =
2649
0
                    ps_pred_ctxt
2650
0
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2651
2652
0
                COMPUTE_DIFF_MV(
2653
0
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2654
2655
0
                mvdx1 = ABS(mvdx1);
2656
0
                mvdy1 = ABS(mvdy1);
2657
2658
0
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2659
0
                             (mvdy1 > 0) + ref_bits + 2;
2660
2661
0
                i4_mv_cost *= lambda;
2662
0
                i4_mv_cost += rnd;
2663
0
                i4_mv_cost >>= lambda_q_shift;
2664
2665
0
                i4_mv_cost = CLIP_U16(i4_mv_cost);
2666
0
            }
2667
2668
0
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2669
0
            {
2670
0
                S32 i4_stim_injected_sad;
2671
0
                S32 i4_stim_injected_cost;
2672
0
                S32 i4_noise_term;
2673
0
                unsigned long u4_shift_val;
2674
0
                S32 i4_bits_req;
2675
2676
0
                S32 update_required = 0;
2677
0
                S32 part_id = pi4_valid_part_ids[i4_count];
2678
0
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2679
2680
0
                WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
2681
2682
0
                S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
2683
2684
0
                if(ps_search_prms->i4_alpha_stim_multiplier)
2685
0
                {
2686
                    /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
2687
0
                    hme_compute_final_sigma_of_pu_from_base_blocks(
2688
0
                        au4_4x4_ref_sigmaX,
2689
0
                        au4_4x4_ref_sigmaXSquared,
2690
0
                        au8_final_ref_sigmaX,
2691
0
                        au8_final_ref_sigmaXSquared,
2692
0
                        16,
2693
0
                        4,
2694
0
                        part_id,
2695
0
                        4);
2696
2697
0
                    u8_ref_X_Square =
2698
0
                        (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
2699
0
                    u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
2700
2701
                    /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
2702
                    /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
2703
                    /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
2704
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
2705
0
                        au8_final_src_sigmaX,
2706
0
                        au8_final_src_sigmaXSquared,
2707
0
                        &u8_src_var,
2708
0
                        i4_inv_wt,
2709
0
                        ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
2710
0
                        ps_wt_inp_prms->wpred_log_wdc,
2711
0
                        part_id);
2712
2713
0
                    u8_ref_var = u8_ref_var >> u4_shift_val;
2714
2715
                    /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
2716
0
                    GETRANGE64(i4_bits_req, u8_ref_var);
2717
2718
0
                    if(i4_bits_req > 27)
2719
0
                    {
2720
0
                        u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
2721
0
                        u8_src_var = u8_src_var >> (i4_bits_req - 27);
2722
0
                    }
2723
2724
0
                    if(u8_src_var == u8_ref_var)
2725
0
                    {
2726
0
                        u8_temp_var = (1 << STIM_Q_FORMAT);
2727
0
                    }
2728
0
                    else
2729
0
                    {
2730
0
                        u8_temp_var = (2 * u8_src_var * u8_ref_var);
2731
0
                        u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
2732
0
                        u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
2733
0
                        u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
2734
0
                        u8_temp_var = (u8_temp_var / u8_temp_var1);
2735
0
                    }
2736
2737
0
                    i4_noise_term = (UWORD32)u8_temp_var;
2738
2739
0
                    ASSERT(i4_noise_term >= 0);
2740
2741
0
                    i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
2742
0
                }
2743
0
                else
2744
0
                {
2745
0
                    i4_noise_term = 0;
2746
0
                }
2747
0
                u8_pure_dist = pi4_sad_grid[part_id];
2748
0
                u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
2749
0
                u8_pure_dist += (1 << ((i4_q_level)-1));
2750
0
                i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
2751
2752
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2753
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2754
0
                i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
2755
0
                i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
2756
2757
0
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
2758
0
                second_best_node_cost =
2759
0
                    CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[1][index]);
2760
2761
0
                if(i4_stim_injected_cost < second_best_node_cost)
2762
0
                {
2763
0
                    update_required = 2;
2764
2765
0
                    if(i4_stim_injected_cost < best_node_cost)
2766
0
                    {
2767
0
                        update_required = 1;
2768
0
                    }
2769
0
                    else if(i4_stim_injected_cost == best_node_cost)
2770
0
                    {
2771
0
                        update_required = 0;
2772
0
                    }
2773
2774
0
                    if(update_required == 2)
2775
0
                    {
2776
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2777
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
2778
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2779
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2780
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2781
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2782
0
                    }
2783
0
                    else if(update_required == 1)
2784
0
                    {
2785
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2786
0
                            ps_mv_refine_ctxt->i2_tot_cost[0][index];
2787
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] =
2788
0
                            ps_mv_refine_ctxt->i2_stim_injected_cost[0][index];
2789
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2790
0
                            ps_mv_refine_ctxt->i2_mv_cost[0][index];
2791
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2792
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2793
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2794
0
                            ps_mv_refine_ctxt->i2_ref_idx[0][index];
2795
2796
0
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2797
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
2798
0
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2799
0
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2800
0
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2801
0
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2802
0
                    }
2803
0
                }
2804
0
            }
2805
0
        }
2806
2807
0
        ps_search_node++;
2808
0
    }
2809
2810
0
    {
2811
0
        WORD32 i4_i;
2812
0
        WORD32 part_id;
2813
0
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2814
0
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2815
0
        {
2816
0
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2817
0
            if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2818
0
            {
2819
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2820
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2821
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2822
2823
0
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2824
0
            }
2825
0
            if(ps_mv_refine_ctxt->i2_stim_injected_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2826
0
            {
2827
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2828
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2829
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2830
2831
0
                ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2832
0
            }
2833
0
        }
2834
0
    }
2835
0
}
2836
2837
void hme_calc_sad_and_1_best_result(
2838
    hme_search_prms_t *ps_search_prms,
2839
    wgt_pred_ctxt_t *ps_wt_inp_prms,
2840
    err_prms_t *ps_err_prms,
2841
    result_upd_prms_t *ps_result_prms,
2842
    U08 **ppu1_ref,
2843
    S32 i4_ref_stride)
2844
4.51M
{
2845
4.51M
    S32 i4_candt;
2846
4.51M
    S32 i4_inp_off;
2847
4.51M
    S32 i4_ref_offset;
2848
4.51M
    S32 i4_num_nodes;
2849
2850
4.51M
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2851
4.51M
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2852
4.51M
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2853
4.51M
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2854
4.51M
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2855
2856
4.51M
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
2857
4.51M
    search_node_t *ps_search_node;
2858
2859
4.51M
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2860
4.51M
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2861
4.51M
    i4_inp_off = ps_search_prms->i4_cu_x_off;
2862
4.51M
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2863
4.51M
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2864
4.51M
    ps_search_node = ps_search_prms->ps_search_nodes;
2865
2866
38.4M
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2867
33.9M
    {
2868
        /**********************************************************************/
2869
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
2870
        /**********************************************************************/
2871
33.9M
        {
2872
33.9M
            WORD32 b, c, d;
2873
33.9M
            UWORD8 *pu1_cur_ptr;
2874
33.9M
            UWORD8 *pu1_ref_ptr;
2875
33.9M
            UWORD16 au2_4x4_sad[NUM_4X4];
2876
2877
33.9M
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2878
0
            {
2879
0
                continue;
2880
0
            }
2881
2882
33.9M
            ps_err_prms->pu1_inp =
2883
33.9M
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2884
33.9M
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2885
33.9M
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2886
33.9M
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2887
2888
33.9M
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2889
33.9M
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2890
2891
            /* Loop to compute the SAD's */
2892
33.9M
            {
2893
33.9M
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2894
576M
                for(b = 0; b < NUM_4X4; b++)
2895
543M
                {
2896
543M
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2897
543M
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2898
2899
2.71G
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2900
2.17G
                    {
2901
2.17G
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2902
2.17G
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2903
10.8G
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2904
8.68G
                        {
2905
8.68G
                            au2_4x4_sad[b] += (UWORD16)ABS((
2906
8.68G
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2907
8.68G
                        }
2908
2.17G
                    }
2909
543M
                }
2910
2911
33.9M
                pi4_sad_grid[PART_ID_NxN_TL] =
2912
33.9M
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2913
33.9M
                pi4_sad_grid[PART_ID_NxN_TR] =
2914
33.9M
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2915
33.9M
                pi4_sad_grid[PART_ID_NxN_BL] =
2916
33.9M
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2917
33.9M
                pi4_sad_grid[PART_ID_NxN_BR] =
2918
33.9M
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2919
33.9M
                pi4_sad_grid[PART_ID_Nx2N_L] =
2920
33.9M
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2921
33.9M
                pi4_sad_grid[PART_ID_Nx2N_R] =
2922
33.9M
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2923
33.9M
                pi4_sad_grid[PART_ID_2NxN_T] =
2924
33.9M
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2925
33.9M
                pi4_sad_grid[PART_ID_2NxN_B] =
2926
33.9M
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2927
33.9M
                pi4_sad_grid[PART_ID_nLx2N_L] =
2928
33.9M
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2929
33.9M
                pi4_sad_grid[PART_ID_nRx2N_R] =
2930
33.9M
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2931
33.9M
                pi4_sad_grid[PART_ID_2NxnU_T] =
2932
33.9M
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2933
33.9M
                pi4_sad_grid[PART_ID_2NxnD_B] =
2934
33.9M
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2935
33.9M
                pi4_sad_grid[PART_ID_2Nx2N] =
2936
33.9M
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2937
33.9M
                pi4_sad_grid[PART_ID_2NxnU_B] =
2938
33.9M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2939
33.9M
                pi4_sad_grid[PART_ID_2NxnD_T] =
2940
33.9M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2941
33.9M
                pi4_sad_grid[PART_ID_nRx2N_L] =
2942
33.9M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2943
33.9M
                pi4_sad_grid[PART_ID_nLx2N_R] =
2944
33.9M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2945
33.9M
            }
2946
33.9M
        }
2947
2948
0
        {
2949
33.9M
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2950
33.9M
            S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2951
33.9M
            S32 best_node_cost;
2952
33.9M
            S32 second_best_node_cost;
2953
2954
33.9M
            {
2955
33.9M
                S16 mvdx1, mvdy1;
2956
33.9M
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2957
33.9M
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2958
33.9M
                S32 pred_lx = i4_search_idx;
2959
2960
33.9M
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2961
33.9M
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2962
33.9M
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2963
2964
33.9M
                S32 inp_shift = 2;
2965
33.9M
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2966
33.9M
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2967
33.9M
                S32 lambda = ps_pred_ctxt->lambda;
2968
33.9M
                S32 rnd = 1 << (lambda_q_shift - 1);
2969
33.9M
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2970
33.9M
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2971
33.9M
                S32 ref_bits =
2972
33.9M
                    ps_pred_ctxt
2973
33.9M
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2974
2975
33.9M
                COMPUTE_DIFF_MV(
2976
33.9M
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2977
2978
33.9M
                mvdx1 = ABS(mvdx1);
2979
33.9M
                mvdy1 = ABS(mvdy1);
2980
2981
33.9M
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2982
33.9M
                             (mvdy1 > 0) + ref_bits + 2;
2983
2984
33.9M
                i4_mv_cost *= lambda;
2985
33.9M
                i4_mv_cost += rnd;
2986
33.9M
                i4_mv_cost >>= lambda_q_shift;
2987
2988
33.9M
                i4_mv_cost = CLIP_U16(i4_mv_cost);
2989
33.9M
            }
2990
2991
            /*For each valid partition, update the refine_prm structure to reflect the best and second
2992
            best candidates for that partition*/
2993
2994
308M
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2995
274M
            {
2996
274M
                S32 update_required = 0;
2997
274M
                S32 part_id = pi4_valid_part_ids[i4_count];
2998
274M
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2999
3000
                /*Calculate total cost*/
3001
274M
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3002
274M
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3003
3004
                /*****************************************************************/
3005
                /* We do not labor through the results if the total cost worse   */
3006
                /* than the last of the results.                                 */
3007
                /*****************************************************************/
3008
274M
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
3009
274M
                second_best_node_cost = SHRT_MAX;
3010
3011
274M
                if(i4_tot_cost < second_best_node_cost)
3012
274M
                {
3013
274M
                    update_required = 0;
3014
3015
                    /*************************************************************/
3016
                    /* Identify where the current result isto be placed.Basically*/
3017
                    /* find the node which has cost just higher thannodeundertest*/
3018
                    /*************************************************************/
3019
274M
                    if(i4_tot_cost < best_node_cost)
3020
59.6M
                    {
3021
59.6M
                        update_required = 1;
3022
59.6M
                    }
3023
215M
                    else if(i4_tot_cost == best_node_cost)
3024
46.1M
                    {
3025
46.1M
                        update_required = 0;
3026
46.1M
                    }
3027
3028
274M
                    if(update_required == 2)
3029
0
                    {
3030
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3031
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3032
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3033
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3034
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3035
0
                    }
3036
274M
                    else if(update_required == 1)
3037
59.6M
                    {
3038
59.6M
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3039
59.6M
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3040
59.6M
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3041
59.6M
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3042
59.6M
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3043
59.6M
                    }
3044
274M
                }
3045
274M
            }
3046
33.9M
        }
3047
33.9M
        ps_search_node++;
3048
33.9M
    }
3049
3050
4.51M
    {
3051
4.51M
        WORD32 i4_i;
3052
4.51M
        WORD32 part_id;
3053
4.51M
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3054
41.7M
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3055
37.2M
        {
3056
37.2M
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3057
37.2M
            if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3058
508k
            {
3059
508k
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3060
508k
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3061
508k
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3062
3063
508k
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3064
508k
            }
3065
37.2M
        }
3066
4.51M
    }
3067
4.51M
}
3068
3069
void hme_calc_stim_injected_sad_and_1_best_result(
3070
    hme_search_prms_t *ps_search_prms,
3071
    wgt_pred_ctxt_t *ps_wt_inp_prms,
3072
    err_prms_t *ps_err_prms,
3073
    result_upd_prms_t *ps_result_prms,
3074
    U08 **ppu1_ref,
3075
    S32 i4_ref_stride)
3076
0
{
3077
0
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
3078
0
    search_node_t *ps_search_node;
3079
3080
0
    S32 i4_candt;
3081
0
    S32 i4_count;
3082
0
    S32 i4_inp_off;
3083
0
    S32 i4_ref_offset;
3084
0
    S32 i4_num_nodes;
3085
0
    ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
3086
0
        au8_final_ref_sigmaXSquared[17];
3087
0
    UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
3088
0
    S32 *pi4_valid_part_ids;
3089
3090
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3091
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3092
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3093
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3094
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3095
3096
0
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
3097
0
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3098
0
    i4_inp_off = ps_search_prms->i4_cu_x_off;
3099
0
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
3100
0
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3101
0
    ps_search_node = ps_search_prms->ps_search_nodes;
3102
0
    pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
3103
3104
    /* Set local pointer to point to partition level sigma values calculated in hme_refine */
3105
0
    au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
3106
0
    au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
3107
3108
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3109
0
    {
3110
0
        {
3111
0
            WORD32 b, c, d;
3112
0
            UWORD8 *pu1_cur_ptr;
3113
0
            UWORD8 *pu1_ref_ptr;
3114
0
            UWORD16 au2_4x4_sad[NUM_4X4];
3115
3116
0
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3117
0
            {
3118
0
                continue;
3119
0
            }
3120
3121
0
            ps_err_prms->pu1_inp =
3122
0
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3123
0
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3124
0
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3125
0
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3126
3127
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
3128
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3129
3130
            /* Loop to compute the SAD's */
3131
0
            {
3132
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3133
0
                for(b = 0; b < NUM_4X4; b++)
3134
0
                {
3135
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3136
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3137
3138
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3139
0
                    {
3140
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
3141
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
3142
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3143
0
                        {
3144
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
3145
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3146
0
                        }
3147
0
                    }
3148
0
                }
3149
3150
                /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
3151
0
                hme_compute_sigmaX_and_sigmaXSquared(
3152
0
                    pu1_ref_ptr,
3153
0
                    ref_buf_stride,
3154
0
                    au4_4x4_ref_sigmaX,
3155
0
                    au4_4x4_ref_sigmaXSquared,
3156
0
                    4,
3157
0
                    4,
3158
0
                    16,
3159
0
                    16,
3160
0
                    1,
3161
0
                    4);
3162
3163
0
                pi4_sad_grid[PART_ID_NxN_TL] =
3164
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3165
0
                pi4_sad_grid[PART_ID_NxN_TR] =
3166
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3167
0
                pi4_sad_grid[PART_ID_NxN_BL] =
3168
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3169
0
                pi4_sad_grid[PART_ID_NxN_BR] =
3170
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3171
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
3172
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3173
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
3174
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3175
0
                pi4_sad_grid[PART_ID_2NxN_T] =
3176
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3177
0
                pi4_sad_grid[PART_ID_2NxN_B] =
3178
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3179
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
3180
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3181
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
3182
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3183
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
3184
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3185
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
3186
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3187
0
                pi4_sad_grid[PART_ID_2Nx2N] =
3188
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3189
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
3190
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3191
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
3192
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3193
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
3194
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3195
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
3196
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3197
0
            }
3198
0
        }
3199
3200
0
        {
3201
0
            S32 i4_sad, i4_mv_cost, i4_tot_cost;
3202
0
            S32 best_node_cost;
3203
0
            S32 second_best_node_cost;
3204
0
            ULWORD64 u8_temp_var, u8_temp_var1;
3205
0
            ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
3206
3207
0
            {
3208
0
                S16 mvdx1, mvdy1;
3209
0
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
3210
0
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
3211
0
                S32 pred_lx = i4_search_idx;
3212
3213
0
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
3214
0
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
3215
0
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
3216
3217
0
                S32 inp_shift = 2;
3218
0
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3219
0
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
3220
0
                S32 lambda = ps_pred_ctxt->lambda;
3221
0
                S32 rnd = 1 << (lambda_q_shift - 1);
3222
0
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3223
0
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3224
0
                S32 ref_bits =
3225
0
                    ps_pred_ctxt
3226
0
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
3227
3228
0
                COMPUTE_DIFF_MV(
3229
0
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
3230
3231
0
                mvdx1 = ABS(mvdx1);
3232
0
                mvdy1 = ABS(mvdy1);
3233
3234
0
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
3235
0
                             (mvdy1 > 0) + ref_bits + 2;
3236
3237
0
                i4_mv_cost *= lambda;
3238
0
                i4_mv_cost += rnd;
3239
0
                i4_mv_cost >>= lambda_q_shift;
3240
3241
0
                i4_mv_cost = CLIP_U16(i4_mv_cost);
3242
0
            }
3243
3244
0
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
3245
0
            {
3246
0
                S32 i4_stim_injected_sad;
3247
0
                S32 i4_stim_injected_cost;
3248
0
                S32 i4_noise_term;
3249
0
                unsigned long u4_shift_val;
3250
0
                S32 i4_bits_req;
3251
3252
0
                S32 update_required = 0;
3253
0
                S32 part_id = pi4_valid_part_ids[i4_count];
3254
0
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3255
3256
0
                WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
3257
3258
0
                S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
3259
3260
0
                if(ps_search_prms->i4_alpha_stim_multiplier)
3261
0
                {
3262
                    /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
3263
0
                    hme_compute_final_sigma_of_pu_from_base_blocks(
3264
0
                        au4_4x4_ref_sigmaX,
3265
0
                        au4_4x4_ref_sigmaXSquared,
3266
0
                        au8_final_ref_sigmaX,
3267
0
                        au8_final_ref_sigmaXSquared,
3268
0
                        16,
3269
0
                        4,
3270
0
                        part_id,
3271
0
                        4);
3272
3273
0
                    u8_ref_X_Square =
3274
0
                        (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
3275
0
                    u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
3276
3277
                    /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
3278
                    /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
3279
                    /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
3280
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
3281
0
                        au8_final_src_sigmaX,
3282
0
                        au8_final_src_sigmaXSquared,
3283
0
                        &u8_src_var,
3284
0
                        i4_inv_wt,
3285
0
                        ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
3286
0
                        ps_wt_inp_prms->wpred_log_wdc,
3287
0
                        part_id);
3288
3289
0
                    u8_ref_var = u8_ref_var >> u4_shift_val;
3290
3291
                    /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
3292
0
                    GETRANGE64(i4_bits_req, u8_ref_var);
3293
3294
0
                    if(i4_bits_req > 27)
3295
0
                    {
3296
0
                        u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
3297
0
                        u8_src_var = u8_src_var >> (i4_bits_req - 27);
3298
0
                    }
3299
3300
0
                    if(u8_src_var == u8_ref_var)
3301
0
                    {
3302
0
                        u8_temp_var = (1 << STIM_Q_FORMAT);
3303
0
                    }
3304
0
                    else
3305
0
                    {
3306
0
                        u8_temp_var = (2 * u8_src_var * u8_ref_var);
3307
0
                        u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
3308
0
                        u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
3309
0
                        u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
3310
0
                        u8_temp_var = (u8_temp_var / u8_temp_var1);
3311
0
                    }
3312
3313
0
                    i4_noise_term = (UWORD32)u8_temp_var;
3314
3315
0
                    ASSERT(i4_noise_term >= 0);
3316
3317
0
                    i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
3318
0
                }
3319
0
                else
3320
0
                {
3321
0
                    i4_noise_term = 0;
3322
0
                }
3323
0
                u8_pure_dist = pi4_sad_grid[part_id];
3324
0
                u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
3325
0
                u8_pure_dist += (1 << ((i4_q_level)-1));
3326
0
                i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
3327
3328
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3329
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3330
0
                i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
3331
0
                i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
3332
3333
0
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
3334
0
                second_best_node_cost = SHRT_MAX;
3335
3336
0
                if(i4_stim_injected_cost < second_best_node_cost)
3337
0
                {
3338
0
                    update_required = 0;
3339
3340
0
                    if(i4_stim_injected_cost < best_node_cost)
3341
0
                    {
3342
0
                        update_required = 1;
3343
0
                    }
3344
0
                    else if(i4_stim_injected_cost == best_node_cost)
3345
0
                    {
3346
0
                        update_required = 0;
3347
0
                    }
3348
3349
0
                    if(update_required == 2)
3350
0
                    {
3351
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3352
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
3353
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3354
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3355
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3356
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3357
0
                    }
3358
0
                    else if(update_required == 1)
3359
0
                    {
3360
0
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3361
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
3362
0
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3363
0
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3364
0
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3365
0
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3366
0
                    }
3367
0
                }
3368
0
            }
3369
0
        }
3370
3371
0
        ps_search_node++;
3372
0
    }
3373
3374
0
    {
3375
0
        WORD32 i4_i;
3376
0
        WORD32 part_id;
3377
0
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3378
0
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3379
0
        {
3380
0
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3381
0
            if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3382
0
            {
3383
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3384
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3385
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3386
3387
0
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3388
0
            }
3389
0
        }
3390
0
    }
3391
0
}
3392
3393
void hme_calc_sad_and_1_best_result_subpel(
3394
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
3395
12.6M
{
3396
12.6M
    S32 i4_candt;
3397
12.6M
    S32 i4_num_nodes;
3398
3399
12.6M
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3400
3401
12.6M
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3402
12.6M
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3403
12.6M
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3404
12.6M
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3405
3406
12.6M
    mv_refine_ctxt_t *ps_subpel_refine_ctxt;
3407
12.6M
    ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
3408
12.6M
    i4_num_nodes = 1;
3409
3410
    /* Run through each of the candts in a loop */
3411
25.2M
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3412
12.6M
    {
3413
        /**********************************************************************/
3414
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
3415
        /**********************************************************************/
3416
12.6M
        {
3417
12.6M
            WORD32 b, c, d;
3418
12.6M
            UWORD8 *pu1_cur_ptr;
3419
12.6M
            UWORD8 *pu1_ref_ptr;
3420
12.6M
            UWORD16 au2_4x4_sad[NUM_4X4];
3421
3422
12.6M
            pu1_cur_ptr = ps_err_prms->pu1_inp;
3423
12.6M
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3424
3425
            /* Loop to compute the SAD's */
3426
12.6M
            {
3427
12.6M
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3428
214M
                for(b = 0; b < NUM_4X4; b++)
3429
201M
                {
3430
201M
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3431
201M
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3432
3433
1.00G
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3434
806M
                    {
3435
806M
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
3436
806M
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
3437
4.03G
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3438
3.22G
                        {
3439
3.22G
                            au2_4x4_sad[b] += (UWORD16)ABS((
3440
3.22G
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3441
3.22G
                        }
3442
806M
                    }
3443
201M
                }
3444
3445
12.6M
                pi4_sad_grid[PART_ID_NxN_TL] =
3446
12.6M
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3447
12.6M
                pi4_sad_grid[PART_ID_NxN_TR] =
3448
12.6M
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3449
12.6M
                pi4_sad_grid[PART_ID_NxN_BL] =
3450
12.6M
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3451
12.6M
                pi4_sad_grid[PART_ID_NxN_BR] =
3452
12.6M
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3453
12.6M
                pi4_sad_grid[PART_ID_Nx2N_L] =
3454
12.6M
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3455
12.6M
                pi4_sad_grid[PART_ID_Nx2N_R] =
3456
12.6M
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3457
12.6M
                pi4_sad_grid[PART_ID_2NxN_T] =
3458
12.6M
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3459
12.6M
                pi4_sad_grid[PART_ID_2NxN_B] =
3460
12.6M
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3461
12.6M
                pi4_sad_grid[PART_ID_nLx2N_L] =
3462
12.6M
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3463
12.6M
                pi4_sad_grid[PART_ID_nRx2N_R] =
3464
12.6M
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3465
12.6M
                pi4_sad_grid[PART_ID_2NxnU_T] =
3466
12.6M
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3467
12.6M
                pi4_sad_grid[PART_ID_2NxnD_B] =
3468
12.6M
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3469
12.6M
                pi4_sad_grid[PART_ID_2Nx2N] =
3470
12.6M
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3471
12.6M
                pi4_sad_grid[PART_ID_2NxnU_B] =
3472
12.6M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3473
12.6M
                pi4_sad_grid[PART_ID_2NxnD_T] =
3474
12.6M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3475
12.6M
                pi4_sad_grid[PART_ID_nRx2N_L] =
3476
12.6M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3477
12.6M
                pi4_sad_grid[PART_ID_nLx2N_R] =
3478
12.6M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3479
12.6M
            }
3480
12.6M
        }
3481
        /**********************************************************************/
3482
        /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
3483
        /**********************************************************************/
3484
12.6M
        {
3485
12.6M
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
3486
12.6M
            S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
3487
12.6M
            S32 best_node_cost;
3488
12.6M
            S32 second_best_node_cost;
3489
3490
            /*For each valid partition, update the refine_prm structure to reflect the best and second
3491
            best candidates for that partition*/
3492
3493
28.0M
            for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
3494
15.4M
            {
3495
15.4M
                S32 update_required = 0;
3496
15.4M
                S32 part_id = pi4_valid_part_ids[i4_count];
3497
15.4M
                S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3498
3499
                /* Use a pre-computed cost instead of freshly evaluating subpel cost */
3500
15.4M
                i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3501
3502
                /*Calculate total cost*/
3503
15.4M
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3504
15.4M
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3505
3506
                /*****************************************************************/
3507
                /* We do not labor through the results if the total cost worse   */
3508
                /* than the last of the results.                                 */
3509
                /*****************************************************************/
3510
15.4M
                best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
3511
15.4M
                second_best_node_cost = SHRT_MAX;
3512
3513
15.4M
                if(i4_tot_cost < second_best_node_cost)
3514
15.4M
                {
3515
15.4M
                    update_required = 0;
3516
3517
                    /*************************************************************/
3518
                    /* Identify where the current result isto be placed.Basically*/
3519
                    /* find the node which has cost just higher thannodeundertest*/
3520
                    /*************************************************************/
3521
15.4M
                    if(i4_tot_cost < best_node_cost)
3522
348k
                    {
3523
348k
                        update_required = 1;
3524
348k
                    }
3525
15.0M
                    else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
3526
11.8M
                    {
3527
11.8M
                        update_required = 0;
3528
11.8M
                    }
3529
15.4M
                    if(update_required == 2)
3530
0
                    {
3531
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3532
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3533
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
3534
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
3535
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
3536
0
                    }
3537
15.4M
                    else if(update_required == 1)
3538
348k
                    {
3539
348k
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3540
348k
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3541
348k
                        ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
3542
348k
                        ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
3543
348k
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
3544
348k
                    }
3545
15.4M
                }
3546
15.4M
            }
3547
12.6M
        }
3548
12.6M
    }
3549
3550
12.6M
    {
3551
12.6M
        WORD32 i4_count = 0;
3552
226M
        for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
3553
214M
        {
3554
214M
            if(ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] >= MAX_SIGNED_16BIT_VAL)
3555
198M
            {
3556
198M
                ps_subpel_refine_ctxt->ai2_fullpel_satd[0][i4_count] = MAX_SIGNED_16BIT_VAL;
3557
198M
            }
3558
214M
        }
3559
12.6M
    }
3560
12.6M
}
3561
3562
/**
3563
********************************************************************************
3564
*  @fn     hme_calc_pt_sad_and_result_explicit(hme_search_prms_t *ps_search_prms,
3565
*                                              wgt_pred_ctxt_t *ps_wt_inp_prms,
3566
*                                              err_prms_t *ps_err_prms,
3567
*                                              result_upd_prms_t *ps_result_prms,
3568
*                                              U08 **ppu1_ref,
3569
*                                              S32 i4_ref_stride)
3570
*
3571
*  @brief   Run thorugh the provided candidates and compute the point SAD and
3572
*           cost and update the results in the order
3573
*
3574
*  @param[in]  ps_search_prms
3575
*  @param[in]  ps_wt_inp_prms
3576
*  @param[in]  ps_err_prms
3577
*  @param[out] ps_result_prms
3578
*  @param[in]  ppu1_ref
3579
*  @param[in]  i4_ref_stride
3580
*
3581
*  @return   None
3582
********************************************************************************
3583
*/
3584
3585
void hme_calc_pt_sad_and_result_explicit(
3586
    hme_search_prms_t *ps_search_prms,
3587
    wgt_pred_ctxt_t *ps_wt_inp_prms,
3588
    err_prms_t *ps_err_prms,
3589
    result_upd_prms_t *ps_result_prms,
3590
    U08 **ppu1_ref,
3591
    S32 i4_ref_stride)
3592
4.89M
{
3593
4.89M
    WORD32 i4_grid_mask, i4_part_mask, i4_num_results, i4_candt, i4_num_nodes;
3594
4.89M
    WORD32 i4_inp_stride, i4_inp_off, i4_ref_offset;
3595
3596
4.89M
    search_node_t *ps_search_node;
3597
4.89M
    BLK_SIZE_T e_blk_size;
3598
4.89M
    PF_SAD_FXN_T pf_sad_fxn;
3599
4.89M
    PF_RESULT_FXN_T pf_hme_result_fxn;
3600
3601
4.89M
    i4_grid_mask = 0x1; /* Point SAD */
3602
3603
    /* Get the parameters required */
3604
4.89M
    i4_part_mask = ps_search_prms->i4_part_mask;
3605
4.89M
    e_blk_size = ps_search_prms->e_blk_size;
3606
4.89M
    i4_num_results = (S32)ps_search_prms->ps_search_results->u1_num_results_per_part;
3607
4.89M
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3608
4.89M
    ps_search_node = ps_search_prms->ps_search_nodes;
3609
3610
4.89M
    i4_inp_stride = ps_search_prms->i4_inp_stride;
3611
    /* Move to the location of the search blk in inp buffer */
3612
4.89M
    i4_inp_off = ps_search_prms->i4_cu_x_off;
3613
4.89M
    i4_inp_off += ps_search_prms->i4_cu_y_off * i4_inp_stride;
3614
4.89M
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3615
3616
4.89M
    pf_sad_fxn = hme_get_sad_fxn(e_blk_size, i4_grid_mask, i4_part_mask);
3617
    /**********************************************************************/
3618
    /* we have a sparsely populated SAD grid of size 9x17.                */
3619
    /* the id of the results in the grid is shown                         */
3620
    /*     5   2   6                                                      */
3621
    /*     1   0   3                                                      */
3622
    /*     7   4   8                                                      */
3623
    /* The motivation for choosing a grid like this is that               */
3624
    /* in case of no refinement, the central location is                  */
3625
    /* the first entry in the grid                                        */
3626
    /* Also for diamond, the 4 entries get considered first               */
3627
    /* This is consistent with the diamond notation used in               */
3628
    /* subpel refinement. To Check                                        */
3629
    /* Update the results for the given search candt                      */
3630
    /* returns the cost of the 2Nx2N partition                            */
3631
    /**********************************************************************/
3632
3633
    /* Get the modified update result fun. with CLIP16 of cost to match   */
3634
    /* with SIMD */
3635
4.89M
    pf_hme_result_fxn = hme_update_results_grid_pu_bestn_no_encode;
3636
3637
44.5M
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3638
39.7M
    {
3639
39.7M
        if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3640
0
            continue;
3641
3642
        /* initialize minimum cost for this candidate. As we search around */
3643
        /* this candidate, this is used to check early exit, when in any   */
3644
        /* given iteration, the center pt of the grid is lowest value      */
3645
39.7M
        ps_result_prms->i4_min_cost = MAX_32BIT_VAL;
3646
3647
39.7M
        ps_err_prms->pu1_inp = ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3648
39.7M
        ps_err_prms->i4_grid_mask = i4_grid_mask;
3649
3650
39.7M
        ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3651
39.7M
        ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3652
39.7M
        ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3653
3654
        /**********************************************************************/
3655
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
3656
        /**********************************************************************/
3657
39.7M
        pf_sad_fxn(ps_err_prms);
3658
3659
        /**********************************************************************/
3660
        /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
3661
        /**********************************************************************/
3662
39.7M
        ps_result_prms->i4_grid_mask = i4_grid_mask;
3663
39.7M
        ps_result_prms->ps_search_node_base = ps_search_node;
3664
39.7M
        pf_hme_result_fxn(ps_result_prms);
3665
3666
39.7M
        ps_search_node++;
3667
39.7M
    }
3668
4.89M
}
3669
3670
/**
3671
********************************************************************************
3672
*  @fn     hme_set_mvp_node(search_results_t *ps_search_results,
3673
*                           search_node_t *ps_candt_prj_coloc,
3674
*                           S08 i1_ref_idx)
3675
*
3676
*  @brief   Set node used for motion vector predictor computation
3677
*           Either TR or L is compared to projected colocated and
3678
*           closest is decided as MVP
3679
*
3680
*  @param[in]  ps_search_results
3681
*
3682
*  @param[in]  ps_candt_prj_coloc
3683
*
3684
*  @param[in]  i1_ref_idx
3685
*
3686
*  @return   None
3687
********************************************************************************
3688
*/
3689
void hme_set_mvp_node(
3690
    search_results_t *ps_search_results,
3691
    search_node_t *ps_candt_prj_coloc,
3692
    U08 u1_pred_lx,
3693
    U08 u1_default_ref_id)
3694
3.48M
{
3695
3.48M
    S32 i;
3696
3.48M
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[u1_pred_lx];
3697
3.48M
    pred_candt_nodes_t *ps_pred_nodes = ps_pred_ctxt->as_pred_nodes;
3698
3.48M
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
3699
3700
3.48M
    S32 inp_shift = 2;
3701
3.48M
    S32 pred_shift;
3702
3.48M
    S32 ref_bits;
3703
3.48M
    S32 mv_p_x, mv_p_y;
3704
3.48M
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
3705
3706
3.48M
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[u1_pred_lx][u1_default_ref_id];
3707
3708
    /*************************************************************************/
3709
    /* Priority to bottom left availability. Else we go to left. If both are */
3710
    /* not available, then a remains null                                    */
3711
    /*************************************************************************/
3712
3.48M
    if(ps_pred_nodes->ps_l->u1_is_avail)
3713
2.65M
    {
3714
2.65M
        ps_pred_node_a = ps_pred_nodes->ps_l;
3715
2.65M
    }
3716
3717
3.48M
    if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
3718
1.52M
    {
3719
1.52M
        ps_pred_node_b = ps_pred_nodes->ps_tr;
3720
1.52M
    }
3721
1.95M
    else
3722
1.95M
    {
3723
1.95M
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
3724
1.95M
        ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3725
1.95M
    }
3726
3727
3.48M
    if(ps_pred_node_a == NULL)
3728
830k
    {
3729
830k
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
3730
830k
        ps_pred_node_a->s_mv = ps_pred_node_a->ps_mv[0];
3731
3732
830k
        if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
3733
209k
        {
3734
209k
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
3735
209k
            ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3736
209k
        }
3737
830k
    }
3738
3739
3.48M
    if(ps_pred_node_a->i1_ref_idx != u1_default_ref_id)
3740
1.18M
    {
3741
1.18M
        SCALE_FOR_POC_DELTA(
3742
1.18M
            mv_p_x, mv_p_y, ps_pred_node_a, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3743
1.18M
    }
3744
2.29M
    else
3745
2.29M
    {
3746
2.29M
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3747
2.29M
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3748
2.29M
    }
3749
3.48M
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3750
3.48M
    COMPUTE_MV_DIFFERENCE(mvdx1, mvdy1, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3751
3.48M
    mvdx1 = ABS(mvdx1);
3752
3.48M
    mvdy1 = ABS(mvdy1);
3753
3754
3.48M
    if(ps_pred_node_b->i1_ref_idx != u1_default_ref_id)
3755
1.18M
    {
3756
1.18M
        SCALE_FOR_POC_DELTA(
3757
1.18M
            mv_p_x, mv_p_y, ps_pred_node_b, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3758
1.18M
    }
3759
2.29M
    else
3760
2.29M
    {
3761
2.29M
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
3762
2.29M
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
3763
2.29M
    }
3764
3.48M
    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
3765
3.48M
    COMPUTE_MV_DIFFERENCE(mvdx2, mvdy2, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3766
3.48M
    mvdx2 = ABS(mvdx2);
3767
3.48M
    mvdy2 = ABS(mvdy2);
3768
3769
3.48M
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
3770
357k
    {
3771
6.42M
        for(i = 0; i < TOT_NUM_PARTS; i++)
3772
6.07M
        {
3773
6.07M
            ps_pred_nodes[i].ps_mvp_node = ps_pred_node_a;
3774
6.07M
        }
3775
357k
    }
3776
3.12M
    else
3777
3.12M
    {
3778
56.2M
        for(i = 0; i < TOT_NUM_PARTS; i++)
3779
53.1M
        {
3780
53.1M
            ps_pred_nodes[i].ps_mvp_node = ps_pred_node_b;
3781
53.1M
        }
3782
3.12M
    }
3783
3.48M
}