Coverage Report

Created: 2026-01-09 07:04

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/encoder/hme_err_compute.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/*!
22
***************************************************************************
23
* \file hme_err_compute.c
24
*
25
* \brief
26
*    SAD / SATD routines for error computation
27
*
28
* Detailed_description : Contains various types of SAD/SATD routines for
29
*   error computation between a given input and reference ptr. The SAD
30
*   routines can evaluate for either a single point or a grid, and can
31
*   evaluate with either partial updates or no partial updates. Partial
32
*   updates means evaluating sub block SADs, e.g. 4 4x4 subblock SAD in
33
*   addition to the main 8x8 block SAD.
34
*
35
* \date
36
*    22/9/2012
37
*
38
* \author  Ittiam
39
***************************************************************************
40
*/
41
42
/*****************************************************************************/
43
/* File Includes                                                             */
44
/*****************************************************************************/
45
/* System include files */
46
#include <stdio.h>
47
#include <string.h>
48
#include <stdlib.h>
49
#include <assert.h>
50
#include <stdarg.h>
51
#include <math.h>
52
#include <limits.h>
53
54
/* User include files */
55
#include "ihevc_typedefs.h"
56
#include "itt_video_api.h"
57
#include "ihevce_api.h"
58
59
#include "rc_cntrl_param.h"
60
#include "rc_frame_info_collector.h"
61
#include "rc_look_ahead_params.h"
62
63
#include "ihevc_defs.h"
64
#include "ihevc_structs.h"
65
#include "ihevc_platform_macros.h"
66
#include "ihevc_deblk.h"
67
#include "ihevc_itrans_recon.h"
68
#include "ihevc_chroma_itrans_recon.h"
69
#include "ihevc_chroma_intra_pred.h"
70
#include "ihevc_intra_pred.h"
71
#include "ihevc_inter_pred.h"
72
#include "ihevc_mem_fns.h"
73
#include "ihevc_padding.h"
74
#include "ihevc_weighted_pred.h"
75
#include "ihevc_sao.h"
76
#include "ihevc_resi_trans.h"
77
#include "ihevc_quant_iquant_ssd.h"
78
#include "ihevc_cabac_tables.h"
79
80
#include "ihevce_defs.h"
81
#include "ihevce_lap_enc_structs.h"
82
#include "ihevce_multi_thrd_structs.h"
83
#include "ihevce_multi_thrd_funcs.h"
84
#include "ihevce_me_common_defs.h"
85
#include "ihevce_had_satd.h"
86
#include "ihevce_error_codes.h"
87
#include "ihevce_bitstream.h"
88
#include "ihevce_cabac.h"
89
#include "ihevce_rdoq_macros.h"
90
#include "ihevce_function_selector.h"
91
#include "ihevce_enc_structs.h"
92
#include "ihevce_entropy_structs.h"
93
#include "ihevce_cmn_utils_instr_set_router.h"
94
#include "ihevce_enc_loop_structs.h"
95
#include "ihevce_bs_compute_ctb.h"
96
#include "ihevce_global_tables.h"
97
#include "ihevce_dep_mngr_interface.h"
98
#include "hme_datatype.h"
99
#include "hme_interface.h"
100
#include "hme_common_defs.h"
101
#include "hme_defs.h"
102
#include "ihevce_me_instr_set_router.h"
103
#include "hme_globals.h"
104
#include "hme_utils.h"
105
#include "hme_coarse.h"
106
#include "hme_refine.h"
107
#include "hme_err_compute.h"
108
#include "hme_common_utils.h"
109
#include "hme_search_algo.h"
110
#include "ihevce_stasino_helpers.h"
111
112
/******************************************************************************
113
*                         MACRO DEFINITIONS
114
******************************************************************************/
115
116
/*****************************************************************************/
117
/* Theoretically, the various types of SAD functions that are needed for     */
118
/* reasons of optimality. SADs that are to be evaluated at a single pt can be*/
119
/* more optimal than SADs that are to be evaluated for a grid of 3x3. The    */
120
/* SADs to be evaluated at a grid are classified as separate functions, since*/
121
/* evaluating them on a single function call helps reuse inputs for a small  */
122
/* grid of 3x3. Also, if no partial updates are required, there are 3 basic  */
123
/* functions, width 4K (K = odd number), width 8K (K = odd number) and width */
124
/* 16K, K any number. For partial updates, it is assumed that the block size */
125
/* is square (8x8, 16x16, 32x32, 64x64) and further differentiation is done  */
126
/* based on the basic evaluation unit. E.g. if 16x16 blk size requires, part */
127
/* update on AMP partitions, then basic SAD unit is 4x4, if it doesnt, then  */
128
/* basic SAD unit is 8x8.                                                    */
129
/*****************************************************************************/
130
131
/* Result-update dispatch names. Every variant (single point or grid, with or */
/* without partial updates, best-1 or best-N) is an alias of the same routine, */
/* hme_update_results_grid_pu_bestn.                                          */

/* Grid evaluation */
#define UPD_RES_GRID_PU_BESTN hme_update_results_grid_pu_bestn
#define UPD_RES_GRID_PU_BEST1 hme_update_results_grid_pu_bestn
#define UPD_RES_GRID_NPU_BESTN hme_update_results_grid_pu_bestn
#define UPD_RES_GRID_NPU_BEST1 hme_update_results_grid_pu_bestn

/* Single point evaluation */
#define UPD_RES_PT_PU_BESTN hme_update_results_grid_pu_bestn
#define UPD_RES_PT_PU_BEST1 hme_update_results_grid_pu_bestn
#define UPD_RES_PT_NPU_BESTN hme_update_results_grid_pu_bestn
#define UPD_RES_PT_NPU_BEST1 hme_update_results_grid_pu_bestn
139
140
/*******************************************************************************
141
*                         FUNCTION DEFINITIONS
142
*******************************************************************************/
143
/**
********************************************************************************
*  @fn     hme_cmp_nodes(search_node_t *ps_best_node1,
*                        search_node_t *ps_best_node2)
*
*  @brief  Tests whether two search nodes carry the same motion vector
*          (x and y components) and the same reference index.
*
*  @param[in]  ps_best_node1 : first node to compare
*
*  @param[in]  ps_best_node2 : second node to compare
*
*  @return   0 when mvx, mvy and ref idx all match, -1 otherwise
********************************************************************************
*/
S32 hme_cmp_nodes(search_node_t *ps_best_node1, search_node_t *ps_best_node2)
{
    /* Bail out on the first field that differs; fields are checked in the */
    /* order mvx, mvy, ref idx.                                            */
    if(ps_best_node1->s_mv.i2_mvx != ps_best_node2->s_mv.i2_mvx)
    {
        return -1;
    }
    if(ps_best_node1->s_mv.i2_mvy != ps_best_node2->s_mv.i2_mvy)
    {
        return -1;
    }
    if(ps_best_node1->i1_ref_idx != ps_best_node2->i1_ref_idx)
    {
        return -1;
    }

    return 0;
}
153
154
void compute_4x4_sads_for_16x16_blk(
155
    grid_ctxt_t *ps_grid, /* Grid ctxt */
156
    UWORD8 *pu1_cur_ptr, /* Pointer to top-left of current block */
157
    WORD32 cur_buf_stride, /* Buffer stride of current buffer */
158
    UWORD16 **
159
        u2_part_sads, /* 2D Array containing SADs for all 17 partitions. As many rows as partitions. SADs in a row correspond to each of the candidates */
160
    cand_t *ps_cand, /* Return the list of candidates evaluated */
161
    WORD32 *num_cands /* Number of candidates that were processed */
162
)
163
0
{
164
0
    WORD32 a, b, c, d, i;
165
0
    WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
166
0
    WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
167
    //WORD32 offset_x[9] = {-grd_sz_x, 0, grd_sz_x, -grd_sz_x, 0, grd_sz_x, grd_sz_x, 0, -grd_sz_x};
168
    //WORD32 offset_y[9] = {-grd_sz_y, -grd_sz_y, -grd_sz_y, 0, 0, 0, grd_sz_y, grd_sz_y, grd_sz_y};
169
    /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
170
0
    WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
171
0
    WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
172
0
    WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
173
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
174
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
175
0
    cand_t *cand0 = ps_cand;
176
0
    UWORD16 au2_4x4_sad[NUM_4X4];
177
178
0
    *num_cands = 0;
179
180
    /* Loop to fill up the cand_t array and to calculate num_cands */
181
0
    for(i = 0; i < ps_grid->num_grids; i++)
182
0
    {
183
0
        WORD32 j;
184
0
        WORD32 mask = ps_grid->pi4_grd_mask[i];
185
0
        UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
186
0
        WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
187
0
        WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
188
189
0
        for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
190
0
        {
191
0
            if(mask & 1)
192
0
            {
193
0
                *num_cands = *num_cands + 1;
194
0
                cand0->grid_ix = i;
195
0
                cand0->ref_idx = ps_grid->p_ref_idx[i];
196
0
                cand0->pu1_ref_ptr =
197
0
                    pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
198
0
                cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
199
0
                cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
200
0
                cand0++;
201
0
            }
202
0
        }
203
0
    }
204
205
    /* Loop to compute the SAD's */
206
0
    for(a = 0; a < *num_cands; a++)
207
0
    {
208
0
        cand_t *cand = ps_cand + a;
209
0
        memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
210
0
        for(b = 0; b < NUM_4X4; b++)
211
0
        {
212
0
            WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
213
0
            WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
214
215
0
            for(c = 0; c < NUM_ROWS_IN_4X4; c++)
216
0
            {
217
0
                WORD32 z_cur = (cur_buf_stride)*c + t1;
218
0
                WORD32 z_ref = (ref_buf_stride)*c + t2;
219
0
                for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
220
0
                {
221
0
                    au2_4x4_sad[b] += (UWORD16)ABS(
222
0
                        (((S32)cand->pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
223
0
                }
224
0
            }
225
0
        }
226
227
0
        u2_part_sads[PART_ID_NxN_TL][a] =
228
0
            (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
229
0
        u2_part_sads[PART_ID_NxN_TR][a] =
230
0
            (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
231
0
        u2_part_sads[PART_ID_NxN_BL][a] =
232
0
            (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
233
0
        u2_part_sads[PART_ID_NxN_BR][a] =
234
0
            (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
235
0
        u2_part_sads[PART_ID_Nx2N_L][a] =
236
0
            u2_part_sads[PART_ID_NxN_TL][a] + u2_part_sads[PART_ID_NxN_BL][a];
237
0
        u2_part_sads[PART_ID_Nx2N_R][a] =
238
0
            u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_BR][a];
239
0
        u2_part_sads[PART_ID_2NxN_T][a] =
240
0
            u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_TL][a];
241
0
        u2_part_sads[PART_ID_2NxN_B][a] =
242
0
            u2_part_sads[PART_ID_NxN_BR][a] + u2_part_sads[PART_ID_NxN_BL][a];
243
0
        u2_part_sads[PART_ID_nLx2N_L][a] =
244
0
            (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
245
0
        u2_part_sads[PART_ID_nRx2N_R][a] =
246
0
            (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
247
0
        u2_part_sads[PART_ID_2NxnU_T][a] =
248
0
            (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
249
0
        u2_part_sads[PART_ID_2NxnD_B][a] =
250
0
            (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
251
0
        u2_part_sads[PART_ID_2Nx2N][a] =
252
0
            u2_part_sads[PART_ID_2NxN_T][a] + u2_part_sads[PART_ID_2NxN_B][a];
253
0
        u2_part_sads[PART_ID_2NxnU_B][a] =
254
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnU_T][a];
255
0
        u2_part_sads[PART_ID_2NxnD_T][a] =
256
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnD_B][a];
257
0
        u2_part_sads[PART_ID_nRx2N_L][a] =
258
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nRx2N_R][a];
259
0
        u2_part_sads[PART_ID_nLx2N_R][a] =
260
0
            u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nLx2N_L][a];
261
0
    }
262
0
}
263
264
/**
265
********************************************************************************
266
*  @fn     compute_part_sads_for_MxM_blk(grid_ctxt_t *ps_grid,
267
*                                       UWORD8      *pu1_cur_ptr,
268
*                                       WORD32      cur_buf_stride,
269
*                                       WORD32     **pi4_part_sads,
270
*                                       cand_t      *ps_cand,
271
*                                       WORD32      *num_cands
272
*
273
*  @brief  Computes partial SADs and updates partition results for an MxM blk
274
*          and does so for several grids of points. This can be used for
275
*          32x32/64x64 blks with 17 partition updates
276
*
277
*
278
*  @param[in]  ps_grid : Pointer to grid ctxt that has multiple grid of max
279
*                        9 pts per grid
280
*
281
*  @param[in]  pu1_cur_ptr : Top left of input buffer
282
*
283
*  @param[in]  pi4_part_sads : array of pointers, each entry pointing to
284
*                             results to be updated for a given partition
285
*
286
*  @return   The ps_search_results structure has the best result updated for
287
*            the 2Nx2N partition alone.
288
289
********************************************************************************
290
*/
291
void compute_part_sads_for_MxM_blk(
292
    grid_ctxt_t *ps_grid,
293
    UWORD8 *pu1_cur_ptr,
294
    WORD32 cur_buf_stride,
295
    WORD32 **pp_part_sads,
296
    cand_t *ps_cand,
297
    WORD32 *num_cands,
298
    CU_SIZE_T e_cu_size)
299
3.04M
{
300
3.04M
    WORD32 a, b, c, d, i;
301
3.04M
    WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
302
3.04M
    WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
303
304
    /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
305
3.04M
    WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
306
3.04M
    WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
307
3.04M
    WORD32 shift = (WORD32)e_cu_size;
308
309
3.04M
    WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
310
3.04M
    WORD32 cur_buf_stride_lsN = (cur_buf_stride << (1 + shift));
311
3.04M
    WORD32 ref_buf_stride_lsN = (ref_buf_stride << (1 + shift));
312
    /* Num rows and pixels per row: 8 for CU_32x32 and 16 for CU_64x64 */
313
3.04M
    WORD32 num_rows_in_nxn = 2 << shift;
314
3.04M
    WORD32 num_pixels_in_row = 2 << shift;
315
3.04M
    cand_t *cand0 = ps_cand;
316
    /* for a 2Nx2N partition we evaluate nxn SADs, where n = N/2. This is */
317
    /* needed for AMP cases.                                              */
318
3.04M
    WORD32 a_nxn_sad[NUM_4X4];
319
3.04M
    *num_cands = 0;
320
321
    /* Loop to fill up the cand_t array and to calculate num_cands */
322
6.09M
    for(i = 0; i < ps_grid->num_grids; i++)
323
3.04M
    {
324
3.04M
        WORD32 j;
325
3.04M
        WORD32 mask = ps_grid->pi4_grd_mask[i];
326
3.04M
        UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
327
3.04M
        WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
328
3.04M
        WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
329
330
30.4M
        for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
331
27.4M
        {
332
27.4M
            if(mask & 1)
333
3.04M
            {
334
3.04M
                *num_cands = *num_cands + 1;
335
3.04M
                cand0->grid_ix = i;
336
3.04M
                cand0->ref_idx = ps_grid->p_ref_idx[i];
337
3.04M
                cand0->pu1_ref_ptr =
338
3.04M
                    pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
339
3.04M
                cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
340
3.04M
                cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
341
3.04M
                cand0++;
342
3.04M
            }
343
27.4M
        }
344
3.04M
    }
345
346
    /* Loop to compute the SAD's */
347
6.09M
    for(a = 0; a < *num_cands; a++)
348
3.04M
    {
349
3.04M
        cand_t *cand = ps_cand + a;
350
3.04M
        memset(&a_nxn_sad[0], 0, NUM_4X4 * sizeof(WORD32));
351
51.8M
        for(b = 0; b < NUM_4X4; b++)
352
48.7M
        {
353
48.7M
            WORD32 t1 = (b % 4) * num_pixels_in_row + (b >> 2) * cur_buf_stride_lsN;
354
48.7M
            WORD32 t2 = (b % 4) * num_pixels_in_row + (b >> 2) * ref_buf_stride_lsN;
355
356
150M
            for(c = 0; c < num_rows_in_nxn; c++)
357
102M
            {
358
102M
                WORD32 z_cur = (cur_buf_stride)*c + t1;
359
102M
                WORD32 z_ref = (ref_buf_stride)*c + t2;
360
348M
                for(d = 0; d < num_pixels_in_row; d++)
361
246M
                {
362
246M
                    a_nxn_sad[b] += (WORD32)ABS(
363
246M
                        (((WORD32)cand->pu1_ref_ptr[(z_ref + d)]) -
364
246M
                         ((WORD32)pu1_cur_ptr[(z_cur + d)])));
365
246M
                }
366
102M
            }
367
48.7M
        }
368
369
3.04M
        pp_part_sads[PART_ID_NxN_TL][a] =
370
3.04M
            (a_nxn_sad[0] + a_nxn_sad[1] + a_nxn_sad[4] + a_nxn_sad[5]);
371
3.04M
        pp_part_sads[PART_ID_NxN_TR][a] =
372
3.04M
            (a_nxn_sad[2] + a_nxn_sad[3] + a_nxn_sad[6] + a_nxn_sad[7]);
373
3.04M
        pp_part_sads[PART_ID_NxN_BL][a] =
374
3.04M
            (a_nxn_sad[8] + a_nxn_sad[9] + a_nxn_sad[12] + a_nxn_sad[13]);
375
3.04M
        pp_part_sads[PART_ID_NxN_BR][a] =
376
3.04M
            (a_nxn_sad[10] + a_nxn_sad[11] + a_nxn_sad[14] + a_nxn_sad[15]);
377
3.04M
        pp_part_sads[PART_ID_Nx2N_L][a] =
378
3.04M
            pp_part_sads[PART_ID_NxN_TL][a] + pp_part_sads[PART_ID_NxN_BL][a];
379
3.04M
        pp_part_sads[PART_ID_Nx2N_R][a] =
380
3.04M
            pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_BR][a];
381
3.04M
        pp_part_sads[PART_ID_2NxN_T][a] =
382
3.04M
            pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_TL][a];
383
3.04M
        pp_part_sads[PART_ID_2NxN_B][a] =
384
3.04M
            pp_part_sads[PART_ID_NxN_BR][a] + pp_part_sads[PART_ID_NxN_BL][a];
385
3.04M
        pp_part_sads[PART_ID_nLx2N_L][a] =
386
3.04M
            (a_nxn_sad[8] + a_nxn_sad[0] + a_nxn_sad[12] + a_nxn_sad[4]);
387
3.04M
        pp_part_sads[PART_ID_nRx2N_R][a] =
388
3.04M
            (a_nxn_sad[3] + a_nxn_sad[7] + a_nxn_sad[15] + a_nxn_sad[11]);
389
3.04M
        pp_part_sads[PART_ID_2NxnU_T][a] =
390
3.04M
            (a_nxn_sad[1] + a_nxn_sad[0] + a_nxn_sad[2] + a_nxn_sad[3]);
391
3.04M
        pp_part_sads[PART_ID_2NxnD_B][a] =
392
3.04M
            (a_nxn_sad[15] + a_nxn_sad[14] + a_nxn_sad[12] + a_nxn_sad[13]);
393
3.04M
        pp_part_sads[PART_ID_2Nx2N][a] =
394
3.04M
            pp_part_sads[PART_ID_2NxN_T][a] + pp_part_sads[PART_ID_2NxN_B][a];
395
3.04M
        pp_part_sads[PART_ID_2NxnU_B][a] =
396
3.04M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnU_T][a];
397
3.04M
        pp_part_sads[PART_ID_2NxnD_T][a] =
398
3.04M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnD_B][a];
399
3.04M
        pp_part_sads[PART_ID_nRx2N_L][a] =
400
3.04M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nRx2N_R][a];
401
3.04M
        pp_part_sads[PART_ID_nLx2N_R][a] =
402
3.04M
            pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nLx2N_L][a];
403
3.04M
    }
404
3.04M
}
405
406
/**
********************************************************************************
*  @fn     hme_evalsad_grid_pu_16x16(err_prms_t *ps_prms)
*
*  @brief  Wraps compute_4x4_sads_for_16x16_blk for a single grid described
*          by ps_prms. A one-grid context is built with a zero mv and ref idx
*          0, and the 16 bit partition SADs are widened into
*          ps_prms->pi4_sad_grid.
*
*  @param[in,out]  ps_prms : reads pu1_inp, pu1_ref, i4_inp_stride,
*                            i4_ref_stride, i4_step and i4_grid_mask; writes
*                            TOT_NUM_PARTS * (number of enabled points) entries
*                            of pi4_sad_grid, partition-major.
*
*  @return   none
********************************************************************************
*/
void hme_evalsad_grid_pu_16x16(err_prms_t *ps_prms)
{
    grid_ctxt_t s_grid;
    cand_t as_candt[9];
    U16 au2_sad_grid[TOT_NUM_PARTS * 9];
    U16 *apu2_sad_grid[TOT_NUM_PARTS];
    hme_mv_t s_mv = { 0, 0 };
    S32 i4_ref_idx = 0;
    S32 num_candts = 0;
    S32 idx;

    /* Single grid; the same step is used in both x and y */
    s_grid.num_grids = 1;
    s_grid.grd_sz_y_x = ((ps_prms->i4_step << 16) | ps_prms->i4_step);
    s_grid.ref_buf_stride = ps_prms->i4_ref_stride;
    s_grid.ppu1_ref_ptr = &ps_prms->pu1_ref;
    s_grid.pi4_grd_mask = &ps_prms->i4_grid_mask;
    s_grid.p_ref_idx = &i4_ref_idx;
    s_grid.p_mv = &s_mv;

    /* Count the enabled points up front: the count is the row pitch of the */
    /* per-partition SAD table handed to the compute routine.               */
    for(idx = 0; idx < 9; idx++)
    {
        num_candts += (s_grid.pi4_grd_mask[0] & (1 << idx)) ? 1 : 0;
    }

    for(idx = 0; idx < TOT_NUM_PARTS; idx++)
    {
        apu2_sad_grid[idx] = &au2_sad_grid[idx * num_candts];
    }

    compute_4x4_sads_for_16x16_blk(
        &s_grid, ps_prms->pu1_inp, ps_prms->i4_inp_stride, apu2_sad_grid, as_candt, &num_candts);

    /* Widen the 16 bit results into the caller's 32 bit SAD grid */
    for(idx = 0; idx < TOT_NUM_PARTS * num_candts; idx++)
    {
        ps_prms->pi4_sad_grid[idx] = au2_sad_grid[idx];
    }
}
438
439
/**
********************************************************************************
*  @fn     hme_evalsad_grid_npu_MxN(err_prms_t *ps_prms)
*
*  @brief  Computes the SAD of an i4_blk_wd x i4_blk_ht block at every enabled
*          point of a 9 point grid, without partial updates. The reference
*          position of point i is the centre shifted by i4_step times
*          gai1_grid_id_to_x[i] / gai1_grid_id_to_y[i].
*
*  @param[in,out]  ps_prms : reads pu1_inp, pu1_ref, strides, block size,
*                            i4_step, i4_grid_mask, i4_part_mask and
*                            pi4_valid_part_ids[0]. SADs are written to
*                            consecutive pi4_sad_grid entries, starting at
*                            pi4_valid_part_ids[0] * (number of enabled points).
*
*  @return   none
********************************************************************************
*/
void hme_evalsad_grid_npu_MxN(err_prms_t *ps_prms)
{
    S32 *pi4_out = ps_prms->pi4_sad_grid;
    S32 i4_step = ps_prms->i4_step;
    S32 i4_x_step = i4_step;
    S32 i4_y_step = i4_step * ps_prms->i4_ref_stride;
    S32 i4_num_pts = 0;
    S32 pt;

    /* At most one bit of the partition mask may be set */
    ASSERT((ps_prms->i4_part_mask & (ps_prms->i4_part_mask - 1)) == 0);

    for(pt = 0; pt < 9; pt++)
    {
        i4_num_pts += (ps_prms->i4_grid_mask & (1 << pt)) ? 1 : 0;
    }

    /* Skip to the output row of the (single) valid partition */
    pi4_out += (ps_prms->pi4_valid_part_ids[0] * i4_num_pts);

    for(pt = 0; pt < 9; pt++)
    {
        U08 *pu1_cur_row, *pu1_ref_row;
        S32 i4_sad, row, col;

        if(ps_prms->i4_grid_mask & (1 << pt))
        {
            pu1_ref_row = ps_prms->pu1_ref + i4_x_step * gai1_grid_id_to_x[pt];
            pu1_ref_row += i4_y_step * gai1_grid_id_to_y[pt];
            pu1_cur_row = ps_prms->pu1_inp;
            i4_sad = 0;

            for(row = 0; row < ps_prms->i4_blk_ht; row++)
            {
                for(col = 0; col < ps_prms->i4_blk_wd; col++)
                {
                    i4_sad += (ABS((pu1_cur_row[col] - pu1_ref_row[col])));
                }
                pu1_cur_row += ps_prms->i4_inp_stride;
                pu1_ref_row += ps_prms->i4_ref_stride;
            }

            *pi4_out = i4_sad;
            pi4_out++;
        }
    }
}
483
484
WORD32 hme_evalsad_pt_npu_MxN_8bit_compute(
485
    WORD32 ht,
486
    WORD32 wd,
487
    UWORD8 *pu1_inp,
488
    UWORD8 *pu1_ref,
489
    WORD32 i4_inp_stride,
490
    WORD32 i4_ref_stride)
491
6.01M
{
492
6.01M
    WORD32 i, j;
493
6.01M
    WORD32 sad = 0;
494
51.6M
    for(i = 0; i < ht; i++)
495
45.6M
    {
496
418M
        for(j = 0; j < wd; j++)
497
372M
        {
498
372M
            sad += (ABS(((S32)pu1_inp[j] - (S32)pu1_ref[j])));
499
372M
        }
500
45.6M
        pu1_inp += i4_inp_stride;
501
45.6M
        pu1_ref += i4_ref_stride;
502
45.6M
    }
503
6.01M
    return sad;
504
6.01M
}
505
506
/**
********************************************************************************
*  @fn     hme_evalsad_pt_npu_MxN_8bit(err_prms_t *ps_prms)
*
*  @brief  Single point SAD without partial updates: evaluates the SAD of the
*          block described by ps_prms and stores it in pi4_sad_grid[0].
*
*  @param[in,out]  ps_prms : reads pu1_inp, pu1_ref, i4_inp_stride,
*                            i4_ref_stride, i4_blk_wd and i4_blk_ht; writes
*                            pi4_sad_grid[0].
*
*  @return   none
********************************************************************************
*/
void hme_evalsad_pt_npu_MxN_8bit(err_prms_t *ps_prms)
{
    S32 i4_sad = hme_evalsad_pt_npu_MxN_8bit_compute(
        ps_prms->i4_blk_ht,
        ps_prms->i4_blk_wd,
        ps_prms->pu1_inp,
        ps_prms->pu1_ref,
        ps_prms->i4_inp_stride,
        ps_prms->i4_ref_stride);

    ps_prms->pi4_sad_grid[0] = i4_sad;
}
520
521
/**
********************************************************************************
*  @fn     compute_satd_8bit(err_prms_t *ps_prms)
*
*  @brief  Accumulates the HAD cost of an i4_blk_wd x i4_blk_ht block by tiling
*          it with fixed size HAD units. Blocks that fit in 16x16 (both
*          dimensions <= 16) use the 4x4 HAD; a block with either dimension
*          above 16 uses the 8x8 HAD.
*
*  @param[in,out]  ps_prms : reads pu1_inp, pu1_ref, i4_inp_stride,
*                            i4_ref_stride, i4_blk_wd, i4_blk_ht and the
*                            function list holding pf_HAD_8x8_8bit and
*                            pf_HAD_4x4_8bit; writes pi4_sad_grid[0].
*
*  @return   none
********************************************************************************
*/
void compute_satd_8bit(err_prms_t *ps_prms)
{
    U08 *pu1_src = ps_prms->pu1_inp;
    U08 *pu1_pred = ps_prms->pu1_ref;
    S32 i4_src_strd = ps_prms->i4_inp_stride;
    S32 i4_pred_strd = ps_prms->i4_ref_stride;
    S32 i4_wd = ps_prms->i4_blk_wd;
    S32 i4_ht = ps_prms->i4_blk_ht;
    U32 u4_cost = 0;
    WORD32 x, y;

    if((i4_wd <= 0x10) && (i4_ht <= 0x10))
    {
        /* Up to 16x16: basic transform unit is 4x4 */
        for(y = 0; y < i4_ht; y += 4)
        {
            U08 *pu1_src_row = pu1_src + i4_src_strd * y;
            U08 *pu1_pred_row = pu1_pred + i4_pred_strd * y;

            for(x = 0; x < i4_wd; x += 4)
            {
                u4_cost += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
                    &pu1_src_row[x], i4_src_strd, &pu1_pred_row[x], i4_pred_strd, NULL, 1);
            }
        }
    }
    else
    {
        /* Larger blocks: basic transform unit is 8x8 */
        for(y = 0; y < i4_ht; y += 8)
        {
            U08 *pu1_src_row = pu1_src + i4_src_strd * y;
            U08 *pu1_pred_row = pu1_pred + i4_pred_strd * y;

            for(x = 0; x < i4_wd; x += 8)
            {
                u4_cost += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
                    &pu1_src_row[x], i4_src_strd, &pu1_pred_row[x], i4_pred_strd, NULL, 1);
            }
        }
    }

    ps_prms->pi4_sad_grid[0] = (S32)u4_cost;
}
574
575
void hme_init_pred_part(
576
    pred_ctxt_t *ps_pred_ctxt,
577
    search_node_t *ps_tl,
578
    search_node_t *ps_t,
579
    search_node_t *ps_tr,
580
    search_node_t *ps_l,
581
    search_node_t *ps_bl,
582
    search_node_t *ps_coloc,
583
    search_node_t *ps_zeromv,
584
    search_node_t **pps_proj_coloc,
585
    PART_ID_T e_part_id)
586
3.66M
{
587
3.66M
    pred_candt_nodes_t *ps_candt_nodes;
588
589
3.66M
    ps_candt_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
590
591
3.66M
    ps_candt_nodes->ps_tl = ps_tl;
592
3.66M
    ps_candt_nodes->ps_tr = ps_tr;
593
3.66M
    ps_candt_nodes->ps_t = ps_t;
594
3.66M
    ps_candt_nodes->ps_l = ps_l;
595
3.66M
    ps_candt_nodes->ps_bl = ps_bl;
596
3.66M
    ps_candt_nodes->ps_coloc = ps_coloc;
597
3.66M
    ps_candt_nodes->ps_zeromv = ps_zeromv;
598
3.66M
    ps_candt_nodes->pps_proj_coloc = pps_proj_coloc;
599
3.66M
}
600
601
void hme_init_pred_ctxt_no_encode(
602
    pred_ctxt_t *ps_pred_ctxt,
603
    search_results_t *ps_search_results,
604
    search_node_t *ps_top_candts,
605
    search_node_t *ps_left_candts,
606
    search_node_t **pps_proj_coloc_candts,
607
    search_node_t *ps_coloc_candts,
608
    search_node_t *ps_zeromv_candt,
609
    S32 pred_lx,
610
    S32 lambda,
611
    S32 lambda_q_shift,
612
    U08 **ppu1_ref_bits_tlu,
613
    S16 *pi2_ref_scf)
614
26.7k
{
615
26.7k
    search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
616
26.7k
    search_node_t *ps_coloc;
617
26.7k
    PART_ID_T e_part_id;
618
619
    /* Assume that resolution is subpel to begin with */
620
26.7k
    ps_pred_ctxt->mv_pel = 0;  // FPEL
621
622
    /* lambda and pred_lx (PRED_L0/PRED_L1) */
623
26.7k
    ps_pred_ctxt->lambda = lambda;
624
26.7k
    ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
625
26.7k
    ps_pred_ctxt->pred_lx = pred_lx;
626
26.7k
    ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
627
26.7k
    ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
628
26.7k
    ps_pred_ctxt->proj_used = 0;
629
630
    /* Bottom left should not be valid */
631
26.7k
    ASSERT(ps_left_candts[2].u1_is_avail == 0);
632
26.7k
    ps_invalid = &ps_left_candts[2];
633
634
    /*************************************************************************/
635
    /* for the case of no encode, the idea is to set up cants as follows     */
636
    /*                                                                       */
637
    /*    ____ ______________                                                */
638
    /*   | TL | T  | T1 | TR |                                               */
639
    /*   |____|____|____|____|                                               */
640
    /*   | L  | b0 | b1 |                                                    */
641
    /*   |____|____|____|                                                    */
642
    /*   | L1 | b2 | b3 |                                                    */
643
    /*   |____|____|____|                                                    */
644
    /*   | BL |                                                              */
645
    /*   |____|                                                              */
646
    /*                                                                       */
647
    /*  If use_4x4 is 0, then b0,b1,b2,b3 are single 8x8 blk. then T=T1      */
648
    /* and L=L1. topleft, top and topright are TL,T,TR respectively          */
649
    /* Left and bottom left is L and BL respectively.                        */
650
    /* If use_4x4 is 1: then the above holds true only for PARTID = 0 (8x8)  */
651
    /*  For the 4 subblocks (partids 4-7)                                    */
652
    /*                                                                       */
653
    /*  Block   Left   Top   Top Left   Top Right   Bottom Left             */
654
    /*    b0    L      T      TL          T1          L1                     */
655
    /*    b1    b0     T1     T           TR          BL(invalid)            */
656
    /*    b2    L1     b0     L0          b1          BL (invalid)           */
657
    /*    b3    b2     b1     b0          BL(inv)     BL (inv)               */
658
    /*                                                                       */
659
    /* Note : For block b1, bottom left pts to b2, which is not yet ready    */
660
    /*  hence it is kept invalid and made to pt to BL. For block b3 top rt   */
661
    /* is invalid and hence made to pt to BL which is invalid.               */
662
    /* BL is invalid since it lies in a bottom left 8x8 blk and not yet ready*/
663
    /*************************************************************************/
664
665
    /* ps_coloc always points to a fixe candt (global) */
666
    /* TODO : replace incoming ps_coloc from global to geniune coloc */
667
26.7k
    ps_coloc = ps_coloc_candts;
668
669
    /* INITIALIZATION OF 8x8 BLK */
670
26.7k
    ps_tl = ps_top_candts;
671
26.7k
    ps_t = ps_tl + 2;
672
26.7k
    ps_tr = ps_t + 1;
673
26.7k
    ps_l = ps_left_candts + 1;
674
26.7k
    ps_bl = ps_invalid;
675
26.7k
    e_part_id = PART_ID_2Nx2N;
676
26.7k
    hme_init_pred_part(
677
26.7k
        ps_pred_ctxt,
678
26.7k
        ps_tl,
679
26.7k
        ps_t,
680
26.7k
        ps_tr,
681
26.7k
        ps_l,
682
26.7k
        ps_bl,
683
26.7k
        ps_coloc,
684
26.7k
        ps_zeromv_candt,
685
26.7k
        pps_proj_coloc_candts,
686
26.7k
        e_part_id);
687
688
    /* INITIALIZATION OF 4x4 TL BLK */
689
26.7k
    e_part_id = PART_ID_NxN_TL;
690
26.7k
    ps_tl = ps_top_candts;
691
26.7k
    ps_t = ps_tl + 1;
692
26.7k
    ps_tr = ps_t + 1;
693
26.7k
    ps_l = ps_left_candts;
694
26.7k
    ps_bl = ps_l + 1;
695
26.7k
    hme_init_pred_part(
696
26.7k
        ps_pred_ctxt,
697
26.7k
        ps_tl,
698
26.7k
        ps_t,
699
26.7k
        ps_tr,
700
26.7k
        ps_l,
701
26.7k
        ps_bl,
702
26.7k
        ps_coloc,
703
26.7k
        ps_zeromv_candt,
704
26.7k
        pps_proj_coloc_candts,
705
26.7k
        e_part_id);
706
707
    /* INITIALIZATION OF 4x4 TR BLK */
708
26.7k
    e_part_id = PART_ID_NxN_TR;
709
26.7k
    ps_tl = ps_top_candts + 1;
710
26.7k
    ps_t = ps_tl + 1;
711
26.7k
    ps_tr = ps_t + 1;
712
26.7k
    ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
713
26.7k
    ps_bl = ps_invalid;
714
26.7k
    hme_init_pred_part(
715
26.7k
        ps_pred_ctxt,
716
26.7k
        ps_tl,
717
26.7k
        ps_t,
718
26.7k
        ps_tr,
719
26.7k
        ps_l,
720
26.7k
        ps_bl,
721
26.7k
        ps_coloc,
722
26.7k
        ps_zeromv_candt,
723
26.7k
        pps_proj_coloc_candts,
724
26.7k
        e_part_id);
725
726
    /* INITIALIZATION OF 4x4 BL BLK */
727
26.7k
    e_part_id = PART_ID_NxN_BL;
728
26.7k
    ps_tl = ps_left_candts;
729
26.7k
    ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
730
26.7k
    ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
731
26.7k
    ps_l = ps_left_candts + 1;
732
26.7k
    ps_bl = ps_invalid;  //invalid
733
26.7k
    hme_init_pred_part(
734
26.7k
        ps_pred_ctxt,
735
26.7k
        ps_tl,
736
26.7k
        ps_t,
737
26.7k
        ps_tr,
738
26.7k
        ps_l,
739
26.7k
        ps_bl,
740
26.7k
        ps_coloc,
741
26.7k
        ps_zeromv_candt,
742
26.7k
        pps_proj_coloc_candts,
743
26.7k
        e_part_id);
744
745
    /* INITIALIZATION OF 4x4 BR BLK */
746
26.7k
    e_part_id = PART_ID_NxN_BR;
747
26.7k
    ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
748
26.7k
    ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
749
26.7k
    ps_tr = ps_invalid;  // invalid
750
26.7k
    ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
751
26.7k
    ps_bl = ps_invalid;  // invalid
752
26.7k
    hme_init_pred_part(
753
26.7k
        ps_pred_ctxt,
754
26.7k
        ps_tl,
755
26.7k
        ps_t,
756
26.7k
        ps_tr,
757
26.7k
        ps_l,
758
26.7k
        ps_bl,
759
26.7k
        ps_coloc,
760
26.7k
        ps_zeromv_candt,
761
26.7k
        pps_proj_coloc_candts,
762
26.7k
        e_part_id);
763
26.7k
}
764
765
void hme_init_pred_ctxt_encode(
766
    pred_ctxt_t *ps_pred_ctxt,
767
    search_results_t *ps_search_results,
768
    search_node_t *ps_coloc_candts,
769
    search_node_t *ps_zeromv_candt,
770
    mv_grid_t *ps_mv_grid,
771
    S32 pred_lx,
772
    S32 lambda,
773
    S32 lambda_q_shift,
774
    U08 **ppu1_ref_bits_tlu,
775
    S16 *pi2_ref_scf)
776
207k
{
777
207k
    search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
778
207k
    search_node_t *ps_coloc;
779
207k
    search_node_t *ps_grid_cu_base;
780
207k
    CU_SIZE_T e_cu_size = ps_search_results->e_cu_size;
781
782
    /* Part Start, Part sizes in 4x4 units */
783
207k
    S32 part_wd, part_ht, part_start_x, part_start_y;
784
785
    /* Partition type, number of partitions in type */
786
207k
    S32 part_id;
787
788
    /* Coordinates of the CU in 4x4 units */
789
207k
    S32 cu_start_x, cu_start_y;
790
207k
    S32 shift = e_cu_size;
791
792
    /* top right and bot left validity at CU level */
793
207k
    S32 cu_tr_valid, cu_bl_valid;
794
    /* strideo f the grid */
795
207k
    S32 grid_stride = ps_mv_grid->i4_stride;
796
797
207k
    ps_pred_ctxt->lambda = lambda;
798
207k
    ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
799
207k
    ps_pred_ctxt->pred_lx = pred_lx;
800
207k
    ps_pred_ctxt->mv_pel = 0;
801
207k
    ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
802
207k
    ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
803
207k
    ps_pred_ctxt->proj_used = 1;
804
805
207k
    cu_start_x = ps_search_results->u1_x_off >> 2;
806
207k
    cu_start_y = ps_search_results->u1_y_off >> 2;
807
808
    /* Coloc always points to fixed global candt */
809
207k
    ps_coloc = ps_coloc_candts;
810
811
    /* Go to base of the CU in the MV Grid */
812
207k
    ps_grid_cu_base = &ps_mv_grid->as_node[0];
813
207k
    ps_grid_cu_base += (ps_mv_grid->i4_start_offset + cu_start_x);
814
207k
    ps_grid_cu_base += (grid_stride * cu_start_y);
815
816
    /* points to the real bottom left of the grid, will never be valid */
817
207k
    ps_invalid = &ps_mv_grid->as_node[0];
818
207k
    ps_invalid += (grid_stride * 17);
819
820
207k
    {
821
207k
        S32 shift = 1 + e_cu_size;
822
207k
        cu_tr_valid = gau1_cu_tr_valid[cu_start_y >> shift][cu_start_x >> shift];
823
207k
        cu_bl_valid = gau1_cu_bl_valid[cu_start_y >> shift][cu_start_x >> shift];
824
207k
    }
825
826
    /*************************************************************************/
827
    /* for the case of    encode, the idea is to set up cants as follows     */
828
    /*                                                                       */
829
    /*    ____ ______________ ____ ____                                      */
830
    /*   | T0 | T1 | T2 | T3 | T4 | T5 |                                     */
831
    /*   |____|____|____|____|____|____|                                     */
832
    /*   | L1 |    |              |                                          */
833
    /*   |____|    |              |                                          */
834
    /*   | L2 | p0 |     p1       |                                          */
835
    /*   |____|    |              |                                          */
836
    /*   | L3 |    |              |                                          */
837
    /*   |____|    |              |                                          */
838
    /*   | L4 | L' |              |                                          */
839
    /*   |____|____|______________|                                          */
840
    /*   | BL |                                                              */
841
    /*   |____|                                                              */
842
    /*  The example is shown with 16x16 CU, though it can be generalized     */
843
    /*  This CU has 2 partitions, cu_wd = 4. also p_wd, p_ht are partition   */
844
    /*  width and ht in 4x4 units.                                           */
845
    /*  For a given CU, derive the top left, top and bottom left and top rt  */
846
    /*  pts. Left and top are assumed to be valid.                           */
847
    /*  IF there aretwo partitions in the CU (like p0 and p1) and vertical,  */
848
    /*  then for first partition, left, top, top left and top right valid    */
849
    /*  Bottom left is valid. store these validity flags. Also store the     */
850
    /*  grid offsets of the partitions w.r.t. CU start in units of 4x4.For p0*/
851
    /*  Left grid offset = -1, 3. Top Grd offset = -1, 0.                    */
852
    /*  Top left grid offset = -1, -1. Top right = 1, -1. BL = -1, 4.        */
853
    /*  For p1, validity flags are left, top, top left, top right, valid.    */
854
    /*  BL is invalid. Grid offsets are: Left = dont care. T = 1, -1 (T2)    */
855
    /*  TR = 4, -1 (T5). TL = 0, -1 (T1). BL = don't care.                   */
856
    /*  For p1, set the left pred candt to the best search result of p0.     */
857
    /*************************************************************************/
858
859
    /* Loop over all partitions, and identify the 5 neighbours */
860
3.73M
    for(part_id = 0; part_id < TOT_NUM_PARTS; part_id++)
861
3.53M
    {
862
3.53M
        part_attr_t *ps_part_attr = &gas_part_attr_in_cu[part_id];
863
3.53M
        S32 tr_valid, bl_valid, is_vert;
864
3.53M
        search_node_t *ps_grid_pu_base;
865
3.53M
        PART_TYPE_T e_part_type;
866
3.53M
        PART_ID_T first_part;
867
3.53M
        S32 part_num;
868
869
3.53M
        e_part_type = ge_part_id_to_part_type[part_id];
870
3.53M
        first_part = ge_part_type_to_part_id[e_part_type][0];
871
3.53M
        is_vert = gau1_is_vert_part[e_part_type];
872
3.53M
        part_num = gau1_part_id_to_part_num[part_id];
873
3.53M
        tr_valid = gau1_partid_tr_valid[part_id] & cu_tr_valid;
874
3.53M
        bl_valid = gau1_partid_bl_valid[part_id] & cu_bl_valid;
875
876
3.53M
        part_start_x = (ps_part_attr->u1_x_start << shift) >> 2;
877
3.53M
        part_start_y = (ps_part_attr->u1_y_start << shift) >> 2;
878
3.53M
        part_wd = (ps_part_attr->u1_x_count << shift) >> 2;
879
3.53M
        part_ht = (ps_part_attr->u1_y_count << shift) >> 2;
880
881
        /* go to top left of part */
882
3.53M
        ps_grid_pu_base = ps_grid_cu_base + part_start_x;
883
3.53M
        ps_grid_pu_base += (part_start_y * grid_stride);
884
885
3.53M
        ps_tl = ps_grid_pu_base - 1 - grid_stride;
886
3.53M
        ps_t = ps_grid_pu_base - grid_stride + part_wd - 1;
887
3.53M
        ps_l = ps_grid_pu_base - 1 + ((part_ht - 1) * grid_stride);
888
3.53M
        ps_tr = ps_t + 1;
889
3.53M
        ps_bl = ps_l + grid_stride;
890
891
3.53M
        if(!tr_valid)
892
1.60M
            ps_tr = ps_invalid;
893
3.53M
        if(!bl_valid)
894
2.58M
            ps_bl = ps_invalid;
895
896
3.53M
        if(part_num == 1)
897
1.45M
        {
898
            /* for cases of two partitions 2nd part has 1st part as candt */
899
            /* if vertical type, left candt of 2nd part is 1st part.      */
900
            /* if horz type, top candt of 2nd part is 1st part.           */
901
1.45M
            if(is_vert)
902
830k
            {
903
830k
                ps_l = ps_search_results->aps_part_results[pred_lx][first_part];
904
830k
            }
905
623k
            else
906
623k
            {
907
623k
                ps_t = ps_search_results->aps_part_results[pred_lx][first_part];
908
623k
            }
909
1.45M
        }
910
3.53M
        if(part_num == 2)
911
207k
        {
912
            /* only possible for NxN_BL */
913
207k
            ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
914
207k
            ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
915
207k
        }
916
3.53M
        if(part_num == 3)
917
207k
        {
918
            /* only possible for NxN_BR */
919
207k
            ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
920
207k
            ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
921
207k
            ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
922
207k
        }
923
3.53M
        hme_init_pred_part(
924
3.53M
            ps_pred_ctxt,
925
3.53M
            ps_tl,
926
3.53M
            ps_t,
927
3.53M
            ps_tr,
928
3.53M
            ps_l,
929
3.53M
            ps_bl,
930
3.53M
            ps_coloc,
931
3.53M
            ps_zeromv_candt,
932
3.53M
            NULL,
933
3.53M
            (PART_ID_T)part_id);
934
3.53M
    }
935
207k
}
936
937
/**
938
********************************************************************************
939
*  @fn     compute_mv_cost_explicit(search_node_t *ps_node,
940
*                   pred_ctxt_t *ps_pred_ctxt,
941
*                   PART_ID_T e_part_id)
942
*
943
*  @brief  MV cost for explicit search in layers not encoded
944
*
945
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
946
*
947
*  @param[in]  ps_pred_ctxt : mv pred context
948
*
949
*  @param[in]  e_part_id : Partition id.
950
*
951
*  @return   Cost value
952
953
********************************************************************************
954
*/
955
S32 compute_mv_cost_explicit(
956
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
957
5.62M
{
958
5.62M
#define RETURN_FIXED_COST 0
959
5.62M
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
960
5.62M
    pred_candt_nodes_t *ps_pred_nodes;
961
5.62M
    S32 inp_shift = 2 - inp_mv_pel;
962
5.62M
    S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
963
5.62M
    S32 mv_p_x, mv_p_y;
964
5.62M
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
965
5.62M
    S32 cost, ref_bits;
966
967
    /*************************************************************************/
968
    /* Logic for cost computation for explicit search. For such a search,    */
969
    /* it is guaranteed that all predictor candts have same ref id. The only */
970
    /* probable issue is with the availability which needs checking. This fxn*/
971
    /* does not suffer the need to scale predictor candts due to diff ref id */
972
    /*************************************************************************/
973
974
    /* Hack: currently we always assume 2Nx2N. */
975
    /* TODO: get rid of this hack and return cost tuned to each partition */
976
5.62M
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
977
5.62M
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
978
979
    /*************************************************************************/
980
    /* Priority to bottom left availability. Else we go to left. If both are */
981
    /* not available, then a remains null                                    */
982
    /*************************************************************************/
983
5.62M
    if(ps_pred_nodes->ps_tl->u1_is_avail)
984
3.97M
        ps_pred_node_a = ps_pred_nodes->ps_tl;
985
1.64M
    else if(ps_pred_nodes->ps_l->u1_is_avail)
986
1.19M
        ps_pred_node_a = ps_pred_nodes->ps_l;
987
988
    /*************************************************************************/
989
    /* For encoder, top left may not be really needed unless we use slices,  */
990
    /* and even then in ME it may not be relevant. So we only consider T or  */
991
    /* TR, as, if both T and TR are not available, TL also will not be       */
992
    /*************************************************************************/
993
5.62M
    if(ps_pred_nodes->ps_tr->u1_is_avail)
994
3.97M
        ps_pred_node_b = ps_pred_nodes->ps_tr;
995
1.64M
    else if(ps_pred_nodes->ps_t->u1_is_avail)
996
354k
        ps_pred_node_b = ps_pred_nodes->ps_t;
997
998
5.62M
    if(ps_pred_node_a == NULL)
999
446k
    {
1000
446k
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
1001
446k
        if(ps_pred_node_b == NULL)
1002
94.1k
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1003
446k
    }
1004
5.17M
    else if(ps_pred_node_b == NULL)
1005
1.19M
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1006
3.97M
    else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1007
1.99M
    {
1008
1.99M
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1009
1.99M
    }
1010
1011
5.62M
    mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1012
5.62M
    mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1013
5.62M
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1014
5.62M
    mvdx1 = ABS(mvdx1);
1015
5.62M
    mvdy1 = ABS(mvdy1);
1016
1017
5.62M
    mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1018
5.62M
    mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1019
5.62M
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1020
5.62M
    mvdx2 = ABS(mvdx2);
1021
5.62M
    mvdy2 = ABS(mvdy2);
1022
1023
5.62M
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1024
1.88M
    {
1025
1.88M
        cost =
1026
1.88M
            hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1027
1.88M
    }
1028
3.73M
    else
1029
3.73M
    {
1030
3.73M
        cost =
1031
3.73M
            hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1032
3.73M
    }
1033
5.62M
    {
1034
5.62M
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1035
5.62M
        return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1036
5.62M
    }
1037
5.62M
}
1038
/**
1039
********************************************************************************
1040
*  @fn     compute_mv_cost_coarse(search_node_t *ps_node,
1041
*                   pred_ctxt_t *ps_pred_ctxt,
1042
*                   PART_ID_T e_part_id)
1043
*
1044
*  @brief  MV cost for coarse explicit search in coarsest layer
1045
*
1046
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1047
*
1048
*  @param[in]  ps_pred_ctxt : mv pred context
1049
*
1050
*  @param[in]  e_part_id : Partition id.
1051
*
1052
*  @return   Cost value
1053
1054
********************************************************************************
1055
*/
1056
S32 compute_mv_cost_coarse(
1057
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1058
5.62M
{
1059
5.62M
    ARG_NOT_USED(e_part_id);
1060
1061
5.62M
    return (compute_mv_cost_explicit(ps_node, ps_pred_ctxt, PART_ID_2Nx2N, inp_mv_pel));
1062
5.62M
}
1063
1064
/**
1065
********************************************************************************
1066
*  @fn     compute_mv_cost_coarse_high_speed(search_node_t *ps_node,
1067
*                                            pred_ctxt_t *ps_pred_ctxt,
1068
*                                            PART_ID_T e_part_id)
1069
*
1070
*  @brief  MV cost for coarse explicit search in coarsest layer
1071
*
1072
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1073
*
1074
*  @param[in]  ps_pred_ctxt : mv pred context
1075
*
1076
*  @param[in]  e_part_id : Partition id.
1077
*
1078
*  @return   Cost value
1079
1080
********************************************************************************
1081
*/
1082
S32 compute_mv_cost_coarse_high_speed(
1083
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1084
43.7M
{
1085
43.7M
    S32 rnd, mvx, mvy, i4_search_idx;
1086
43.7M
    S32 cost;
1087
1088
43.7M
    mvx = ps_node->s_mv.i2_mvx;
1089
43.7M
    mvy = ps_node->s_mv.i2_mvy;
1090
43.7M
    i4_search_idx = ps_node->i1_ref_idx;
1091
1092
43.7M
    cost = (2 * hme_get_range(ABS(mvx)) - 1) + (2 * hme_get_range(ABS(mvy)) - 1) + i4_search_idx;
1093
43.7M
    cost += (mvx != 0) ? 1 : 0;
1094
43.7M
    cost += (mvy != 0) ? 1 : 0;
1095
43.7M
    rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1096
43.7M
    cost = (cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift;
1097
43.7M
    return cost;
1098
43.7M
}
1099
1100
/**
1101
********************************************************************************
1102
*  @fn     compute_mv_cost_explicit_refine(search_node_t *ps_node,
1103
*                                          pred_ctxt_t *ps_pred_ctxt,
1104
*                                          PART_ID_T e_part_id)
1105
*
1106
*  @brief  MV cost for explicit search in layers not encoded. Always returns
1107
*          cost of the projected colocated candidate
1108
*
1109
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1110
*
1111
*  @param[in]  ps_pred_ctxt : mv pred context
1112
*
1113
*  @param[in]  e_part_id : Partition id.
1114
*
1115
*  @return   Cost value
1116
1117
********************************************************************************
1118
*/
1119
S32 compute_mv_cost_explicit_refine(
1120
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1121
18.9M
{
1122
18.9M
    search_node_t *ps_pred_node_a = NULL;
1123
18.9M
    pred_candt_nodes_t *ps_pred_nodes;
1124
18.9M
    S32 inp_shift = 2 - inp_mv_pel;
1125
18.9M
    S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
1126
18.9M
    S32 mv_p_x, mv_p_y;
1127
18.9M
    S16 mvdx1, mvdy1;
1128
18.9M
    S32 cost, ref_bits;
1129
1130
18.9M
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1131
18.9M
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1132
1133
18.9M
    ps_pred_node_a = ps_pred_nodes->pps_proj_coloc[0];
1134
1135
18.9M
    mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1136
18.9M
    mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1137
18.9M
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1138
18.9M
    mvdx1 = ABS(mvdx1);
1139
18.9M
    mvdy1 = ABS(mvdy1);
1140
1141
18.9M
    cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1142
1143
18.9M
    {
1144
18.9M
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1145
18.9M
        return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1146
18.9M
    }
1147
18.9M
}
1148
1149
/**
1150
********************************************************************************
1151
*  @fn     compute_mv_cost_refine(search_node_t *ps_node,
1152
*                   pred_ctxt_t *ps_pred_ctxt,
1153
*                   PART_ID_T e_part_id)
1154
*
1155
*  @brief  MV cost for coarse explicit search in coarsest layer
1156
*
1157
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1158
*
1159
*  @param[in]  ps_pred_ctxt : mv pred context
1160
*
1161
*  @param[in]  e_part_id : Partition id.
1162
*
1163
*  @return   Cost value
1164
1165
********************************************************************************
1166
*/
1167
S32 compute_mv_cost_refine(
1168
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1169
18.9M
{
1170
18.9M
    return (compute_mv_cost_explicit_refine(ps_node, ps_pred_ctxt, e_part_id, inp_mv_pel));
1171
18.9M
}
1172
1173
S32 compute_mv_cost_implicit(
1174
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1175
0
{
1176
0
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
1177
0
    pred_candt_nodes_t *ps_pred_nodes;
1178
0
    S08 i1_ref_idx;
1179
0
    S08 i1_ref_tl = -1, i1_ref_tr = -1, i1_ref_t = -1;
1180
0
    S08 i1_ref_bl = -1, i1_ref_l = -1;
1181
0
    S32 inp_shift = 2 - inp_mv_pel;
1182
0
    S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel;*/
1183
0
    S32 ref_bits, cost;
1184
0
    S32 mv_p_x, mv_p_y;
1185
0
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
1186
1187
    //return 0;
1188
0
    i1_ref_idx = ps_node->i1_ref_idx;
1189
1190
    /*************************************************************************/
1191
    /* Logic for cost computation for explicit search. For such a search,    */
1192
    /* it is guaranteed that all predictor candts have same ref id. The only */
1193
    /* probable issue is with the availability which needs checking. This fxn*/
1194
    /* does not suffer the need to scale predictor candts due to diff ref id */
1195
    /*************************************************************************/
1196
1197
0
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1198
0
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
1199
1200
    /*************************************************************************/
1201
    /* Priority to bottom left availability. Else we go to left. If both are */
1202
    /* not available, then a remains null                                    */
1203
    /*************************************************************************/
1204
0
    if(ps_pred_nodes->ps_bl->u1_is_avail)
1205
0
        i1_ref_bl = ps_pred_nodes->ps_bl->i1_ref_idx;
1206
0
    if(ps_pred_nodes->ps_l->u1_is_avail)
1207
0
        i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
1208
0
    if(i1_ref_bl == i1_ref_idx)
1209
0
        ps_pred_node_a = ps_pred_nodes->ps_bl;
1210
0
    else if(i1_ref_l == i1_ref_idx)
1211
0
        ps_pred_node_a = ps_pred_nodes->ps_l;
1212
0
    if(ps_pred_node_a == NULL)
1213
0
    {
1214
0
        if(i1_ref_bl != -1)
1215
0
            ps_pred_node_a = ps_pred_nodes->ps_bl;
1216
0
        else if(i1_ref_l != -1)
1217
0
            ps_pred_node_a = ps_pred_nodes->ps_l;
1218
0
    }
1219
1220
    /*************************************************************************/
1221
    /* For encoder, top left may not be really needed unless we use slices,  */
1222
    /* and even then in ME it may not be relevant. So we only consider T or  */
1223
    /* TR, as, if both T and TR are not available, TL also will not be       */
1224
    /*************************************************************************/
1225
0
    if(ps_pred_nodes->ps_tr->u1_is_avail)
1226
0
        i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
1227
0
    if(ps_pred_nodes->ps_t->u1_is_avail)
1228
0
        i1_ref_t = ps_pred_nodes->ps_t->i1_ref_idx;
1229
0
    if(ps_pred_nodes->ps_tl->u1_is_avail)
1230
0
        i1_ref_tl = ps_pred_nodes->ps_tl->i1_ref_idx;
1231
0
    if(i1_ref_tr == i1_ref_idx)
1232
0
        ps_pred_node_b = ps_pred_nodes->ps_tr;
1233
0
    else if(i1_ref_t == i1_ref_idx)
1234
0
        ps_pred_node_b = ps_pred_nodes->ps_t;
1235
0
    else if(i1_ref_tl == i1_ref_idx)
1236
0
        ps_pred_node_b = ps_pred_nodes->ps_tl;
1237
1238
0
    if(ps_pred_node_b == NULL)
1239
0
    {
1240
0
        if(i1_ref_tr != -1)
1241
0
            ps_pred_node_b = ps_pred_nodes->ps_tr;
1242
0
        else if(i1_ref_t != -1)
1243
0
            ps_pred_node_b = ps_pred_nodes->ps_t;
1244
0
        else if(i1_ref_tl != -1)
1245
0
            ps_pred_node_b = ps_pred_nodes->ps_tl;
1246
0
    }
1247
0
    if(ps_pred_node_a == NULL)
1248
0
    {
1249
0
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
1250
0
        if(ps_pred_node_b == NULL)
1251
0
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1252
0
    }
1253
0
    else if(ps_pred_node_b == NULL)
1254
0
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1255
0
    else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1256
0
    {
1257
0
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1258
0
    }
1259
1260
0
    if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
1261
0
    {
1262
0
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1263
0
    }
1264
0
    else
1265
0
    {
1266
0
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1267
0
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1268
0
    }
1269
0
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1270
0
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1271
0
    mvdx1 = ABS(mvdx1);
1272
0
    mvdy1 = ABS(mvdy1);
1273
1274
0
    if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
1275
0
    {
1276
0
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1277
0
    }
1278
0
    else
1279
0
    {
1280
0
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1281
0
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1282
0
    }
1283
0
    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
1284
0
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1285
0
    mvdx2 = ABS(mvdx2);
1286
0
    mvdy2 = ABS(mvdy2);
1287
1288
0
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1289
0
    {
1290
0
        cost = 2 * hme_get_range(mvdx1) + 2 * hme_get_range(mvdy1) + 2 * (mvdx1 > 0) +
1291
0
               2 * (mvdy1 > 0) + ref_bits + 2;
1292
0
    }
1293
0
    else
1294
0
    {
1295
0
        cost = 2 * hme_get_range(mvdx2) + 2 * hme_get_range(mvdy2) + 2 * (mvdx2 > 0) +
1296
0
               2 * (mvdy2 > 0) + ref_bits + 2;
1297
0
    }
1298
0
    {
1299
        /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
1300
0
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift);
1301
0
        S32 tot_cost = (cost * ps_pred_ctxt->lambda) << 1;
1302
1303
0
        tot_cost += (gau1_bits_for_part_id_q1[e_part_id] * ps_pred_ctxt->lambda);
1304
0
        return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift + 1));
1305
0
    }
1306
0
}
1307
1308
S32 compute_mv_cost_implicit_high_speed(
1309
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1310
444k
{
1311
444k
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
1312
444k
    pred_candt_nodes_t *ps_pred_nodes;
1313
444k
    S08 i1_ref_idx;
1314
444k
    S08 i1_ref_tr = -1;
1315
444k
    S08 i1_ref_l = -1;
1316
444k
    S32 inp_shift = 2 - inp_mv_pel;
1317
444k
    S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
1318
444k
    S32 ref_bits, cost;
1319
444k
    S32 mv_p_x, mv_p_y;
1320
444k
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
1321
1322
444k
    i1_ref_idx = ps_node->i1_ref_idx;
1323
1324
444k
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1325
444k
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
1326
1327
    /*************************************************************************/
1328
    /* Priority to bottom left availability. Else we go to left. If both are */
1329
    /* not available, then a remains null                                    */
1330
    /*************************************************************************/
1331
444k
    if(ps_pred_nodes->ps_l->u1_is_avail)
1332
373k
    {
1333
373k
        i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
1334
373k
        ps_pred_node_a = ps_pred_nodes->ps_l;
1335
373k
    }
1336
1337
    /*************************************************************************/
1338
    /* For encoder, top left may not be really needed unless we use slices,  */
1339
    /* and even then in ME it may not be relevant. So we only consider T or  */
1340
    /* TR, as, if both T and TR are not available, TL also will not be       */
1341
    /*************************************************************************/
1342
1343
444k
    if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
1344
96.1k
    {
1345
96.1k
        i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
1346
96.1k
        ps_pred_node_b = ps_pred_nodes->ps_tr;
1347
96.1k
    }
1348
348k
    else
1349
348k
    {
1350
348k
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
1351
348k
    }
1352
1353
444k
    if(ps_pred_node_a == NULL)
1354
70.8k
    {
1355
70.8k
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
1356
1357
70.8k
        if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
1358
48.7k
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1359
70.8k
    }
1360
1361
444k
    if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
1362
50.7k
    {
1363
50.7k
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1364
50.7k
    }
1365
393k
    else
1366
393k
    {
1367
393k
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1368
393k
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1369
393k
    }
1370
1371
444k
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1372
444k
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1373
444k
    mvdx1 = ABS(mvdx1);
1374
444k
    mvdy1 = ABS(mvdy1);
1375
1376
444k
    if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
1377
104k
    {
1378
104k
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1379
104k
    }
1380
339k
    else
1381
339k
    {
1382
339k
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1383
339k
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1384
339k
    }
1385
1386
444k
    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
1387
444k
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1388
444k
    mvdx2 = ABS(mvdx2);
1389
444k
    mvdy2 = ABS(mvdy2);
1390
1391
444k
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1392
138k
    {
1393
138k
        cost =
1394
138k
            hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1395
138k
    }
1396
305k
    else
1397
305k
    {
1398
305k
        cost =
1399
305k
            hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1400
305k
    }
1401
444k
    {
1402
        /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
1403
444k
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1404
444k
        S32 tot_cost = (cost * ps_pred_ctxt->lambda);
1405
1406
444k
        return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift));
1407
444k
    }
1408
444k
}
1409
1410
S32 compute_mv_cost_implicit_high_speed_modified(
1411
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1412
0
{
1413
0
    search_node_t *ps_pred_node_a = NULL;
1414
0
    pred_candt_nodes_t *ps_pred_nodes;
1415
0
    S32 inp_shift = 2 - inp_mv_pel;
1416
0
    S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
1417
0
    S32 mv_p_x, mv_p_y;
1418
0
    S16 mvdx1, mvdy1;
1419
0
    S32 cost, ref_bits;
1420
1421
0
    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1422
0
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1423
1424
0
    ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
1425
1426
0
    mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1427
0
    mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1428
0
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1429
0
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1430
0
    mvdx1 = ABS(mvdx1);
1431
0
    mvdy1 = ABS(mvdy1);
1432
1433
0
    cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1434
1435
0
    {
1436
0
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1437
0
        return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1438
0
    }
1439
0
}
1440
1441
/**
********************************************************************************
*  @fn     hme_update_results_grid_pu_bestn_xtreme_speed(result_upd_prms_t *ps_result_prms)
*
*  @brief  Evaluates every active point of the search grid for the first valid
*          part id only, and keeps the cheapest candidate (SAD + mv cost) as
*          result 0 of that part
*
*  @param[in,out]  ps_result_prms : input parameters to this fxn. Read:
*                  ps_search_results, ps_search_node_base, pi4_valid_part_ids,
*                  pi4_sad_grid, i4_grid_mask, i4_step, i1_ref_idx.
*                  Written: i4_min_cost (only when a point beats the stored
*                  result) and i4_min_id.
*
*  @return   None. i4_min_id holds the last grid point that improved the stored
*            result, or PT_C when no point did.
********************************************************************************
*/
void hme_update_results_grid_pu_bestn_xtreme_speed(result_upd_prms_t *ps_result_prms)
{
    /* Original author note: the function was modified with the assumption */
    /* that only 2NxN_B and Nx2N_R is modified                             */

    search_results_t *ps_results = ps_result_prms->ps_search_results;
    const search_node_t *ps_centre = ps_result_prms->ps_search_node_base;
    search_node_t s_cand = *ps_centre;
    search_node_t *ps_best;
    const S32 *pi4_part_sad;
    S32 *pi4_part_ids = ps_result_prms->pi4_valid_part_ids;
    S32 i4_mask = ps_result_prms->i4_grid_mask;
    S32 i4_step = ps_result_prms->i4_step;
    S32 i4_ref_id = (S32)ps_result_prms->i1_ref_idx;
    S32 i4_num_best = (S32)ps_results->u1_num_results_per_part;
    S32 i4_pred_lx = 1 - ps_results->pu1_is_past[i4_ref_id];
    S32 i4_part_id = pi4_part_ids[0];
    S32 i4_best_pt = (S32)PT_C;
    S32 i4_num_active = 0;
    S32 i4_pt;

    /* Number of active grid points = stride between two parts in the SAD grid */
    for(i4_pt = 0; i4_pt < 9; i4_pt++)
    {
        i4_num_active += (0 != (i4_mask & (1 << i4_pt)));
    }

    /* Only the first valid part id is handled by this function. Its SADs */
    /* are consecutive, one entry per active grid point.                  */
    pi4_part_sad = ps_result_prms->pi4_sad_grid + (i4_num_active * i4_part_id);

    /* Best results stored so far for this reference and part */
    ps_best = ps_results->aps_part_results[i4_ref_id][i4_part_id];

    /*************************************************************************/
    /* Walk through all the pts of the grid, skipping the inactive ones      */
    /*************************************************************************/
    for(i4_pt = 0; i4_pt < (S32)NUM_GRID_PTS; i4_pt++)
    {
        S32 i4_sad, i4_mv_cost, i4_tot_cost;

        if(0 == (i4_mask & (1 << i4_pt)))
        {
            continue;
        }

        /* Candidate mv = centre mv displaced by the grid offset of this pt, */
        /* scaled by the step. Updates are in FPEL units.                    */
        s_cand.s_mv.i2_mvx = ps_centre->s_mv.i2_mvx;
        s_cand.s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_pt]);
        s_cand.s_mv.i2_mvy = ps_centre->s_mv.i2_mvy;
        s_cand.s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_pt]);

        /* mv cost and total cost of this part for this mv */
        i4_mv_cost = compute_mv_cost_coarse_high_speed(
            &s_cand, &ps_results->as_pred_ctxt[i4_pred_lx], (PART_ID_T)i4_part_id, MV_RES_FPEL);

        i4_sad = *pi4_part_sad;
        i4_tot_cost = i4_sad + i4_mv_cost;

        ASSERT(pi4_part_ids[0] == i4_part_id);
        ASSERT(i4_num_best == 1);

        /* Replace the stored result only when this pt is strictly cheaper */
        if(i4_tot_cost < ps_best[i4_num_best - 1].i4_tot_cost)
        {
            i4_best_pt = i4_pt;
            ps_result_prms->i4_min_cost = i4_tot_cost;

            ps_best[0] = s_cand;
            ps_best[0].i4_sad = i4_sad;
            ps_best[0].i4_mv_cost = i4_mv_cost;
            ps_best[0].i4_tot_cost = i4_tot_cost;
        }

        /* SAD of the next active pt */
        pi4_part_sad++;
    }

    ps_result_prms->i4_min_id = i4_best_pt;
}
1555
1556
/**
********************************************************************************
*  @fn     hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
*
*  @brief  For every active point of the search grid and every valid part id,
*          forms the total cost (SAD + mv cost) and places the candidate in
*          the list of best results of that part
*
*  @param[in,out]  ps_result_prms : input parameters to this fxn. Read:
*                  ps_search_results, ps_search_node_base, pi4_valid_part_ids
*                  (terminated by a negative id), pi4_sad_grid, i4_grid_mask,
*                  i4_step, i1_ref_idx, pf_mv_cost_compute, i4_min_cost.
*                  Written: i4_min_cost and i4_min_id.
*
*  @return   None. i4_min_cost is lowered, and i4_min_id set to the grid point,
*            whenever the first valid part id gets a cost below i4_min_cost;
*            i4_min_id is PT_C when that never happens.
********************************************************************************
*/
void hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
{
    search_results_t *ps_results = ps_result_prms->ps_search_results;
    const search_node_t *ps_centre = ps_result_prms->ps_search_node_base;
    search_node_t s_cand = *ps_centre;
    S32 *pi4_part_ids = ps_result_prms->pi4_valid_part_ids;
    S32 *pi4_sad = ps_result_prms->pi4_sad_grid;
    S32 i4_mask = ps_result_prms->i4_grid_mask;
    S32 i4_step = ps_result_prms->i4_step;
    S32 i4_ref_id = (S32)ps_result_prms->i1_ref_idx;
    S32 i4_num_best = (S32)ps_results->u1_num_results_per_part;
    S32 i4_pred_lx = 1 - ps_results->pu1_is_past[i4_ref_id];
    S32 i4_first_part = pi4_part_ids[0];
    S32 i4_best_pt = (S32)PT_C;
    S32 i4_num_active = 0;
    S32 i4_active_idx = 0;
    S32 i4_pt;

    /* Number of active grid points = stride between two parts in the SAD grid */
    for(i4_pt = 0; i4_pt < 9; i4_pt++)
    {
        i4_num_active += (0 != (i4_mask & (1 << i4_pt)));
    }

    /*************************************************************************/
    /* Outer loop runs through all active pts in the grid                    */
    /*************************************************************************/
    for(i4_pt = 0; i4_pt < (S32)NUM_GRID_PTS; i4_pt++)
    {
        S32 i4_part, i4_part_id;

        if(0 == (i4_mask & (1 << i4_pt)))
        {
            continue;
        }

        /* Candidate mv = centre mv displaced by the grid offset of this pt, */
        /* scaled by the step. Updates are in FPEL units.                    */
        s_cand.s_mv.i2_mvx = ps_centre->s_mv.i2_mvx;
        s_cand.s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_pt]);
        s_cand.s_mv.i2_mvy = ps_centre->s_mv.i2_mvy;
        s_cand.s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_pt]);

        /* Inner loop runs through the valid part ids till a negative id */
        for(i4_part = 0; (i4_part_id = pi4_part_ids[i4_part]) >= 0; i4_part++)
        {
            /* Best results stored so far for this reference and part */
            search_node_t *ps_best = ps_results->aps_part_results[i4_ref_id][i4_part_id];
            S32 i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
                &s_cand, &ps_results->as_pred_ctxt[i4_pred_lx], (PART_ID_T)i4_part_id, MV_RES_FPEL);
            S32 i4_sad = pi4_sad[(i4_num_active * i4_part_id) + i4_active_idx];
            S32 i4_tot_cost = i4_sad + i4_mv_cost;
            S32 i4_slot;

            /* The first valid part id alone drives the min cost / min id outputs */
            if((i4_first_part == i4_part_id) && (i4_tot_cost < ps_result_prms->i4_min_cost))
            {
                i4_best_pt = i4_pt;
                ps_result_prms->i4_min_cost = i4_tot_cost;
            }

            /* Nothing to do unless the candidate beats the last stored result */
            if(i4_tot_cost >= ps_best[i4_num_best - 1].i4_tot_cost)
            {
                continue;
            }

            /* Locate the slot for the candidate. The scan stops at the     */
            /* first costlier entry (entries from there on move down by one)*/
            /* or at an equal cost entry for which hme_cmp_nodes returns 0  */
            /* (that entry is overwritten). Otherwise the last slot is used.*/
            i4_slot = 0;
            while(i4_slot < i4_num_best - 1)
            {
                S32 i4_stored_cost = ps_best[i4_slot].i4_tot_cost;

                if(i4_tot_cost < i4_stored_cost)
                {
                    memmove(
                        ps_best + i4_slot + 1,
                        ps_best + i4_slot,
                        sizeof(search_node_t) * (i4_num_best - 1 - i4_slot));
                    break;
                }
                if((i4_tot_cost == i4_stored_cost) &&
                   (0 == hme_cmp_nodes(&s_cand, ps_best + i4_slot)))
                {
                    break;
                }
                i4_slot++;
            }

            ps_best[i4_slot] = s_cand;
            ps_best[i4_slot].i4_sad = i4_sad;
            ps_best[i4_slot].i4_mv_cost = i4_mv_cost;
            ps_best[i4_slot].i4_tot_cost = i4_tot_cost;
        }

        /* SADs of the next active pt */
        i4_active_idx++;
    }

    ps_result_prms->i4_min_id = i4_best_pt;
}
1670
1671
/**
1672
********************************************************************************
1673
*  @fn     hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1674
*
1675
*  @brief  Updates results for the case where 1 best result is to be updated
1676
*          for a given pt, for several parts
1677
*          Note : The function is replicated for CLIPing the cost to 16bit to make
1678
*                  bit match with SIMD version
1679
*
1680
*  @param[in]  result_upd_prms_t : Contains the input parameters to this fxn
1681
*
1682
*  @return   The result_upd_prms_t structure is updated for all the active
1683
*            parts in case the current candt has results for any given part
1684
*             that is the best result for that part
1685
********************************************************************************
1686
*/
1687
void hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1688
6.88M
{
1689
6.88M
    search_node_t s_search_node_grid;
1690
6.88M
    const search_node_t *ps_search_node_base;
1691
6.88M
    search_node_t *ps_search_node_grid, *ps_best_node;
1692
6.88M
    S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
1693
6.88M
    S32 num_results, i4_unique_id = -1, i4_grid_pt;
1694
6.88M
    search_results_t *ps_search_results;
1695
6.88M
    S32 *pi4_valid_part_ids;
1696
6.88M
    S32 i4_step = ps_result_prms->i4_step;
1697
6.88M
    S32 i4_grid_mask, i4_count, i, i4_min_id;
1698
6.88M
    S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
1699
6.88M
    S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
1700
6.88M
    S32 grid_count = 0;
1701
6.88M
    S32 pred_lx;
1702
1703
6.88M
    i4_min_id = (S32)PT_C;
1704
6.88M
    i4_min_cost = MAX_32BIT_VAL;
1705
6.88M
    ps_search_node_grid = &s_search_node_grid;
1706
6.88M
    ps_search_node_base = ps_result_prms->ps_search_node_base;
1707
6.88M
    *ps_search_node_grid = *ps_search_node_base;
1708
6.88M
    pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1709
6.88M
    ps_search_results = ps_result_prms->ps_search_results;
1710
6.88M
    num_results = (S32)ps_search_results->u1_num_results_per_part;
1711
6.88M
    i4_grid_mask = ps_result_prms->i4_grid_mask;
1712
1713
68.8M
    for(i = 0; i < 9; i++)
1714
61.9M
    {
1715
61.9M
        if(i4_grid_mask & (1 << i))
1716
6.88M
            grid_count++;
1717
61.9M
    }
1718
1719
    /* Some basic assumptions: only single pt, only part updates */
1720
    /* and more than 1 best result to be computed.               */
1721
1722
6.88M
    i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
1723
6.88M
    pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
1724
1725
    /*************************************************************************/
1726
    /* Supposing we do hte result update for a unique partid, we can */
1727
    /* store the best pt id in the grid and also min cost is return */
1728
    /* param. This will be useful for early exit cases.             */
1729
    /* TODO : once we have separate fxn for unique part+grid, we can */
1730
    /* do away with this code here                                   */
1731
    /*************************************************************************/
1732
    //if (pi4_valid_part_ids[1] == -1)
1733
6.88M
    i4_unique_id = pi4_valid_part_ids[0];
1734
1735
    /*************************************************************************/
1736
    /* Outer loop runs through all active pts in the grid                    */
1737
    /*************************************************************************/
1738
68.8M
    for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
1739
61.9M
    {
1740
61.9M
        if(!(i4_grid_mask & (1 << i4_grid_pt)))
1741
55.0M
            continue;
1742
1743
        /* For the pt in the grid, update mvx and y depending on */
1744
        /* location of pt. Updates are in FPEL units.            */
1745
6.88M
        ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
1746
6.88M
        ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
1747
6.88M
        ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
1748
6.88M
        ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
1749
1750
6.88M
        i4_count = 0;
1751
1752
        /* pi4_valid_part_ids contains all the valid ids. We loop through */
1753
        /* this till we encounter -1. This is easier than having to       */
1754
        /* figure out part by part, besides, active part decision is      */
1755
        /* usually fixed for a given duration of search, e.g. entire fpel */
1756
        /* refinement for a blk/cu will use fixed valid part mask         */
1757
1758
25.7M
        while((id = pi4_valid_part_ids[i4_count]) >= 0)
1759
18.9M
        {
1760
            //ps_search_node_grid->e_part_type = (PART_TYPE_T)id;
1761
1762
            /*****************************************************************/
1763
            /* points to the best search results corresponding to this       */
1764
            /* specific part type.                                           */
1765
            /*****************************************************************/
1766
18.9M
            ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1767
1768
            /* evaluate mv cost and totalcost for this part for this given mv*/
1769
18.9M
            i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
1770
18.9M
                ps_search_node_grid,
1771
18.9M
                &ps_search_results->as_pred_ctxt[pred_lx],
1772
18.9M
                (PART_ID_T)id,
1773
18.9M
                MV_RES_FPEL);
1774
1775
18.9M
            i4_sad = pi4_sad_grid[grid_count * id];
1776
1777
            /* Clipping to 16 bit to bit match with SIMD version */
1778
18.9M
            i4_mv_cost = CLIP_S16(i4_mv_cost);
1779
18.9M
            i4_sad = CLIP_S16(i4_sad);
1780
1781
18.9M
            i4_tot_cost = i4_sad + i4_mv_cost;
1782
            /* Clipping to 16 bit to bit match with SIMD version */
1783
18.9M
            i4_tot_cost = CLIP_S16(i4_tot_cost);
1784
1785
18.9M
            if(i4_unique_id == id)
1786
6.88M
            {
1787
6.88M
                if(i4_tot_cost < ps_result_prms->i4_min_cost)
1788
6.88M
                {
1789
6.88M
                    i4_min_id = i4_grid_pt;
1790
6.88M
                    ps_result_prms->i4_min_cost = i4_tot_cost;
1791
6.88M
                }
1792
6.88M
            }
1793
1794
            /*****************************************************************/
1795
            /* We do not labor through the results if the total cost worse   */
1796
            /* than the last of the results.                                 */
1797
            /*****************************************************************/
1798
18.9M
            if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1799
3.84M
            {
1800
                /*************************************************************/
1801
                /* Identify where the current result isto be placed.Basically*/
1802
                /* find the node which has cost just higher thannodeundertest*/
1803
                /*************************************************************/
1804
4.51M
                for(i = 0; i < num_results - 1; i++)
1805
2.21M
                {
1806
2.21M
                    if(i4_tot_cost <= ps_best_node[i].i4_tot_cost)
1807
1.54M
                    {
1808
1.54M
                        memmove(
1809
1.54M
                            ps_best_node + i + 1,
1810
1.54M
                            ps_best_node + i,
1811
1.54M
                            sizeof(search_node_t) * (num_results - 1 - i));
1812
1.54M
                        break;
1813
1.54M
                    }
1814
2.21M
                }
1815
3.84M
                ps_best_node[i] = *ps_search_node_grid;
1816
3.84M
                ps_best_node[i].i4_sad = i4_sad;
1817
3.84M
                ps_best_node[i].i4_mv_cost = i4_mv_cost;
1818
3.84M
                ps_best_node[i].i4_tot_cost = i4_tot_cost;
1819
3.84M
            }
1820
18.9M
            i4_count++;
1821
18.9M
        }
1822
6.88M
        pi4_sad_grid++;
1823
6.88M
    }
1824
6.88M
    ps_result_prms->i4_min_id = i4_min_id;
1825
6.88M
}
1826
1827
/**
1828
********************************************************************************
1829
*  @fn     hme_update_results_pt_npu_best1(result_upd_prms_t *ps_result_prms)
1830
*
1831
*  @brief  Updates results for the case where 1 best result is to be updated
1832
*          for a given pt, for several parts
1833
*
1834
*  @param[in]  ps_result_prms. Contains the input parameters to this fxn
1835
*              ::ps_pred_info : contains cost fxn ptr and predictor info
1836
*              ::pi4_sad : 17x9 SAD Grid, this case, only 1st 17 entries valid
1837
*              ::ps_search_results: Search results structure
1838
*              ::i1_ref_id : Reference index
1839
*              ::i4_grid_mask: Dont Care for this fxn
1840
*              ::pi4_valid_part_ids : valid part ids
1841
*              ::ps_search_node_base: Contains the centre pt candt info.
1842
*
1843
*  @return   The ps_search_results structure is updated for all the active
1844
*            parts in case the current candt has results for any given part
1845
*             that is the best result for that part
1846
********************************************************************************
1847
*/
1848
1849
void hme_update_results_pt_pu_best1_subpel_hs(
1850
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1851
130k
{
1852
130k
    search_node_t *ps_search_node_base, *ps_best_node;
1853
130k
    search_results_t *ps_search_results;
1854
130k
    S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1855
130k
    S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1856
130k
    S32 num_results, i;
1857
130k
    S32 *pi4_valid_part_ids;
1858
1859
130k
    pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1860
    /* Some basic assumptions: only single pt, only part updates */
1861
    /* and more than 1 best result to be computed.               */
1862
130k
    ASSERT(ps_result_prms->i4_grid_mask == 1);
1863
1864
130k
    ps_search_results = ps_result_prms->ps_search_results;
1865
130k
    num_results = (S32)ps_search_results->u1_num_results_per_part;
1866
1867
    /* Compute mv cost, total cost */
1868
130k
    ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1869
1870
1.25M
    while((id = pi4_valid_part_ids[i4_count]) >= 0)
1871
1.12M
    {
1872
1.12M
        S32 update_required = 1;
1873
1874
1.12M
        ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1875
        /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1876
1.12M
        i4_mv_cost = ps_best_node->i4_mv_cost;
1877
1.12M
        i4_sad = ps_result_prms->pi4_sad_grid[id];
1878
1.12M
        i4_tot_cost = i4_sad + i4_mv_cost;
1879
1880
        /* We do not labor through the results if the total cost is worse than   */
1881
        /* the last of the results.                                              */
1882
1.12M
        if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1883
574k
        {
1884
            /* Identify where the current result is to be placed. Basically find  */
1885
            /* the node which has cost just higher than node under test           */
1886
574k
            for(i = 0; i < num_results - 1; i++)
1887
0
            {
1888
0
                if(ps_best_node[i].i1_ref_idx != -1)
1889
0
                {
1890
0
                    if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
1891
0
                    {
1892
0
                        memmove(
1893
0
                            ps_best_node + i + 1,
1894
0
                            ps_best_node + i,
1895
0
                            sizeof(search_node_t) * (num_results - 1 - i));
1896
0
                        break;
1897
0
                    }
1898
0
                    else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
1899
0
                    {
1900
0
                        update_required = 0;
1901
0
                        break;
1902
0
                    }
1903
0
                }
1904
0
                else
1905
0
                {
1906
0
                    break;
1907
0
                }
1908
0
            }
1909
1910
574k
            if(update_required)
1911
574k
            {
1912
                /* Update when either ref_idx or mv's are different */
1913
574k
                ps_best_node[i] = *ps_search_node_base;
1914
574k
                ps_best_node[i].i4_sad = i4_sad;
1915
574k
                ps_best_node[i].i4_mv_cost = i4_mv_cost;
1916
574k
                ps_best_node[i].i4_tot_cost = i4_tot_cost;
1917
574k
            }
1918
574k
        }
1919
1.12M
        i4_count++;
1920
1.12M
    }
1921
130k
}
1922
1923
void hme_update_results_pt_pu_best1_subpel_hs_1(
1924
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1925
0
{
1926
0
    search_node_t *ps_search_node_base, *ps_best_node;
1927
0
    search_results_t *ps_search_results;
1928
0
    S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1929
0
    S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1930
0
    S32 num_results;
1931
0
    S32 *pi4_valid_part_ids;
1932
1933
0
    pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1934
    /* Some basic assumptions: only single pt, only part updates */
1935
    /* and more than 1 best result to be computed.               */
1936
0
    ASSERT(ps_result_prms->i4_grid_mask == 1);
1937
1938
0
    ps_search_results = ps_result_prms->ps_search_results;
1939
0
    num_results = (S32)ps_search_results->u1_num_results_per_part;
1940
1941
    /* Compute mv cost, total cost */
1942
0
    ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1943
1944
0
    while((id = pi4_valid_part_ids[i4_count]) >= 0)
1945
0
    {
1946
0
        S32 update_required = 0;
1947
1948
0
        ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1949
        /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1950
0
        i4_mv_cost = ps_best_node->i4_mv_cost;
1951
0
        i4_sad = ps_result_prms->pi4_sad_grid[id];
1952
0
        i4_tot_cost = i4_sad + i4_mv_cost;
1953
1954
        /* We do not labor through the results if the total cost is worse than   */
1955
        /* the last of the results.                                              */
1956
0
        if(i4_tot_cost < ps_best_node[1].i4_tot_cost)
1957
0
        {
1958
0
            S32 sdi_value = 0;
1959
1960
0
            update_required = 2;
1961
            /* Identify where the current result is to be placed. Basically find  */
1962
            /* the node which has cost just higher than node under test           */
1963
0
            {
1964
0
                if(i4_tot_cost < ps_best_node[0].i4_tot_cost)
1965
0
                {
1966
0
                    update_required = 1;
1967
0
                    sdi_value = ps_best_node[0].i4_sad - i4_sad;
1968
0
                }
1969
0
                else if(
1970
0
                    (ps_result_prms->i2_mv_x == ps_best_node[0].s_mv.i2_mvx) &&
1971
0
                    (ps_result_prms->i2_mv_y == ps_best_node[0].s_mv.i2_mvy) &&
1972
0
                    (ps_best_node[0].i1_ref_idx == ps_result_prms->i1_ref_idx))
1973
0
                {
1974
0
                    update_required = 0;
1975
0
                }
1976
0
            }
1977
0
            if(update_required == 2)
1978
0
            {
1979
0
                subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1980
1981
0
                ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] = i4_tot_cost;
1982
0
                ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] = i4_mv_cost;
1983
0
                ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] = ps_result_prms->i2_mv_x;
1984
0
                ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] = ps_result_prms->i2_mv_y;
1985
0
                ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] = ps_result_prms->i1_ref_idx;
1986
0
            }
1987
0
            else if(update_required == 1)
1988
0
            {
1989
0
                subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1990
1991
0
                ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] =
1992
0
                    ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count];
1993
0
                ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] =
1994
0
                    ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count];
1995
0
                ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] =
1996
0
                    ps_subpel_refine_ctxt->i2_mv_x[0][i4_count];
1997
0
                ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] =
1998
0
                    ps_subpel_refine_ctxt->i2_mv_y[0][i4_count];
1999
0
                ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] =
2000
0
                    ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count];
2001
2002
0
                ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] = i4_tot_cost;
2003
0
                ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count] = i4_mv_cost;
2004
0
                ps_subpel_refine_ctxt->i2_mv_x[0][i4_count] = ps_result_prms->i2_mv_x;
2005
0
                ps_subpel_refine_ctxt->i2_mv_y[0][i4_count] = ps_result_prms->i2_mv_y;
2006
0
                ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count] = ps_result_prms->i1_ref_idx;
2007
0
            }
2008
0
        }
2009
0
        i4_count++;
2010
0
    }
2011
0
}
2012
2013
/**
2014
******************************************************************************
2015
*  @brief Gives a result fxn ptr for a index [x] where x is as:
2016
*         0 : single pt, no partial updates, 1 best result
2017
*         1 : single pt, no partial updates, N best results
2018
*         2 : single pt,    partial updates, 1 best result
2019
*         3 : single pt,    partial updates, N best results
2020
*         0 : grid     , no partial updates, 1 best result
2021
*         1 : grid     , no partial updates, N best results
2022
*         2 : grid     ,    partial updates, 1 best result
2023
*         3 : grid     ,    partial updates, N best results
2024
******************************************************************************
2025
*/
2026
2027
static PF_RESULT_FXN_T g_pf_result_fxn[8] = { UPD_RES_PT_NPU_BEST1,   UPD_RES_PT_NPU_BESTN,
2028
                                              UPD_RES_PT_PU_BEST1,    UPD_RES_PT_PU_BESTN,
2029
                                              UPD_RES_GRID_NPU_BEST1, UPD_RES_GRID_NPU_BESTN,
2030
                                              UPD_RES_GRID_PU_BEST1,  UPD_RES_GRID_PU_BESTN };
2031
2032
/**
2033
********************************************************************************
2034
*  @fn     hme_get_result_fxn(i4_grid_mask, i4_part_mask, i4_num_results)
2035
*
2036
*  @brief  Obtains the suitable result function that evaluates COST and also
2037
*           computes one or more best results for point/grid, single part or
2038
*           more than one part.
2039
*
2040
*  @param[in]  i4_grid_mask : Mask containing which of 9 grid pts active
2041
*
2042
*  @param[in]  i4_part_mask : Mask containing which of the 17 parts active
2043
*
2044
*  @param[in]  i4_num_results: Number of active results
2045
*
2046
*  @return   Pointer to the appropriate result update function
2047
********************************************************************************
2048
*/
2049
PF_RESULT_FXN_T hme_get_result_fxn(S32 i4_grid_mask, S32 i4_part_mask, S32 i4_num_results)
2050
2.06M
{
2051
2.06M
    S32 i4_is_grid = (i4_grid_mask != 1);
2052
2.06M
    S32 i4_is_pu = ((i4_part_mask & (i4_part_mask - 1)) != 0);
2053
2.06M
    S32 i4_res_gt1 = (i4_num_results > 1);
2054
2.06M
    S32 id;
2055
2056
2.06M
    id = (i4_is_grid << 2) + (i4_is_pu << 1) + i4_res_gt1;
2057
2058
2.06M
    return (g_pf_result_fxn[id]);
2059
2.06M
}
2060
2061
void hme_calc_sad_and_2_best_results(
2062
    hme_search_prms_t *ps_search_prms,
2063
    wgt_pred_ctxt_t *ps_wt_inp_prms,
2064
    err_prms_t *ps_err_prms,
2065
    result_upd_prms_t *ps_result_prms,
2066
    U08 **ppu1_ref,
2067
    S32 i4_ref_stride)
2068
0
{
2069
0
    S32 i4_candt;
2070
0
    S32 i4_inp_off;
2071
0
    S32 i4_ref_offset;
2072
0
    S32 i4_num_nodes;
2073
2074
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2075
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2076
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2077
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2078
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2079
2080
0
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
2081
0
    search_node_t *ps_search_node;
2082
2083
0
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2084
0
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2085
0
    i4_inp_off = ps_search_prms->i4_cu_x_off;
2086
0
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2087
0
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2088
0
    ps_search_node = ps_search_prms->ps_search_nodes;
2089
2090
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2091
0
    {
2092
        /**********************************************************************/
2093
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
2094
        /**********************************************************************/
2095
0
        {
2096
0
            WORD32 b, c, d;
2097
0
            UWORD8 *pu1_cur_ptr;
2098
0
            UWORD8 *pu1_ref_ptr;
2099
0
            UWORD16 au2_4x4_sad[NUM_4X4];
2100
2101
0
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2102
0
            {
2103
0
                continue;
2104
0
            }
2105
2106
0
            ps_err_prms->pu1_inp =
2107
0
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2108
0
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2109
0
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2110
0
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2111
2112
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2113
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2114
2115
            /* Loop to compute the SAD's */
2116
0
            {
2117
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2118
0
                for(b = 0; b < NUM_4X4; b++)
2119
0
                {
2120
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2121
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2122
2123
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2124
0
                    {
2125
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2126
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2127
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2128
0
                        {
2129
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
2130
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2131
0
                        }
2132
0
                    }
2133
0
                }
2134
2135
0
                pi4_sad_grid[PART_ID_NxN_TL] =
2136
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2137
0
                pi4_sad_grid[PART_ID_NxN_TR] =
2138
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2139
0
                pi4_sad_grid[PART_ID_NxN_BL] =
2140
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2141
0
                pi4_sad_grid[PART_ID_NxN_BR] =
2142
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2143
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
2144
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2145
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
2146
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2147
0
                pi4_sad_grid[PART_ID_2NxN_T] =
2148
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2149
0
                pi4_sad_grid[PART_ID_2NxN_B] =
2150
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2151
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
2152
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2153
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
2154
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2155
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
2156
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2157
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
2158
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2159
0
                pi4_sad_grid[PART_ID_2Nx2N] =
2160
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2161
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
2162
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2163
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
2164
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2165
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
2166
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2167
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
2168
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2169
0
            }
2170
0
        }
2171
2172
0
        {
2173
0
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2174
0
            S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2175
0
            S32 best_node_cost;
2176
0
            S32 second_best_node_cost;
2177
2178
0
            {
2179
0
                S16 mvdx1, mvdy1;
2180
0
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2181
0
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2182
0
                S32 pred_lx = i4_search_idx;
2183
2184
0
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2185
0
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2186
0
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2187
2188
0
                S32 inp_shift = 2;
2189
0
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2190
0
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2191
0
                S32 lambda = ps_pred_ctxt->lambda;
2192
0
                S32 rnd = 1 << (lambda_q_shift - 1);
2193
0
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2194
0
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2195
0
                S32 ref_bits =
2196
0
                    ps_pred_ctxt
2197
0
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2198
2199
0
                COMPUTE_DIFF_MV(
2200
0
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2201
2202
0
                mvdx1 = ABS(mvdx1);
2203
0
                mvdy1 = ABS(mvdy1);
2204
2205
0
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2206
0
                             (mvdy1 > 0) + ref_bits + 2;
2207
2208
0
                i4_mv_cost *= lambda;
2209
0
                i4_mv_cost += rnd;
2210
0
                i4_mv_cost >>= lambda_q_shift;
2211
2212
0
                i4_mv_cost = CLIP_U16(i4_mv_cost);
2213
0
            }
2214
2215
            /*For each valid partition, update the refine_prm structure to reflect the best and second
2216
            best candidates for that partition*/
2217
2218
0
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2219
0
            {
2220
0
                S32 update_required = 0;
2221
0
                S32 part_id = pi4_valid_part_ids[i4_count];
2222
0
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2223
2224
                /*Calculate total cost*/
2225
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2226
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2227
2228
                /*****************************************************************/
2229
                /* We do not labor through the results if the total cost worse   */
2230
                /* than the last of the results.                                 */
2231
                /*****************************************************************/
2232
0
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
2233
0
                second_best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[1][index]);
2234
2235
0
                if(i4_tot_cost < second_best_node_cost)
2236
0
                {
2237
0
                    update_required = 2;
2238
2239
                    /*************************************************************/
2240
                    /* Identify where the current result isto be placed.Basically*/
2241
                    /* find the node which has cost just higher thannodeundertest*/
2242
                    /*************************************************************/
2243
0
                    if(i4_tot_cost < best_node_cost)
2244
0
                    {
2245
0
                        update_required = 1;
2246
0
                    }
2247
0
                    else if(i4_tot_cost == best_node_cost)
2248
0
                    {
2249
0
                        update_required = 0;
2250
0
                    }
2251
2252
0
                    if(update_required == 2)
2253
0
                    {
2254
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2255
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2256
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2257
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2258
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2259
0
                    }
2260
0
                    else if(update_required == 1)
2261
0
                    {
2262
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2263
0
                            ps_mv_refine_ctxt->i2_tot_cost[0][index];
2264
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2265
0
                            ps_mv_refine_ctxt->i2_mv_cost[0][index];
2266
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2267
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2268
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2269
0
                            ps_mv_refine_ctxt->i2_ref_idx[0][index];
2270
2271
0
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2272
0
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2273
0
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2274
0
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2275
0
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2276
0
                    }
2277
0
                }
2278
0
            }
2279
0
        }
2280
0
        ps_search_node++;
2281
0
    }
2282
2283
0
    {
2284
0
        WORD32 i4_i;
2285
0
        WORD32 part_id;
2286
0
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2287
0
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2288
0
        {
2289
0
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2290
0
            if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2291
0
            {
2292
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2293
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2294
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2295
2296
0
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2297
0
            }
2298
0
            if(ps_mv_refine_ctxt->i2_tot_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2299
0
            {
2300
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2301
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2302
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2303
2304
0
                ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2305
0
            }
2306
0
        }
2307
0
    }
2308
0
}
2309
2310
void hme_calc_sad_and_2_best_results_subpel(
2311
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
2312
0
{
2313
0
    S32 i4_candt;
2314
0
    S32 i4_num_nodes;
2315
2316
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2317
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2318
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2319
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2320
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2321
2322
0
    mv_refine_ctxt_t *ps_subpel_refine_ctxt;
2323
0
    ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
2324
0
    i4_num_nodes = 1;
2325
2326
    /* Run through each of the candts in a loop */
2327
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2328
0
    {
2329
        /**********************************************************************/
2330
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
2331
        /**********************************************************************/
2332
0
        {
2333
0
            WORD32 b, c, d;
2334
0
            UWORD8 *pu1_cur_ptr;
2335
0
            UWORD8 *pu1_ref_ptr;
2336
0
            UWORD16 au2_4x4_sad[NUM_4X4];
2337
2338
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2339
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2340
2341
            /* Loop to compute the SAD's */
2342
0
            {
2343
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2344
0
                for(b = 0; b < NUM_4X4; b++)
2345
0
                {
2346
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2347
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2348
2349
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2350
0
                    {
2351
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2352
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2353
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2354
0
                        {
2355
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
2356
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2357
0
                        }
2358
0
                    }
2359
0
                }
2360
2361
0
                pi4_sad_grid[PART_ID_NxN_TL] =
2362
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2363
0
                pi4_sad_grid[PART_ID_NxN_TR] =
2364
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2365
0
                pi4_sad_grid[PART_ID_NxN_BL] =
2366
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2367
0
                pi4_sad_grid[PART_ID_NxN_BR] =
2368
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2369
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
2370
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2371
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
2372
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2373
0
                pi4_sad_grid[PART_ID_2NxN_T] =
2374
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2375
0
                pi4_sad_grid[PART_ID_2NxN_B] =
2376
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2377
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
2378
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2379
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
2380
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2381
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
2382
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2383
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
2384
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2385
0
                pi4_sad_grid[PART_ID_2Nx2N] =
2386
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2387
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
2388
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2389
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
2390
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2391
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
2392
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2393
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
2394
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2395
0
            }
2396
0
        }
2397
        /**********************************************************************/
2398
        /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
2399
        /**********************************************************************/
2400
0
        {
2401
0
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2402
0
            S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
2403
0
            S32 best_node_cost;
2404
0
            S32 second_best_node_cost;
2405
2406
            /*For each valid partition, update the refine_prm structure to reflect the best and second
2407
            best candidates for that partition*/
2408
2409
0
            for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
2410
0
            {
2411
0
                S32 update_required = 0;
2412
0
                S32 part_id = pi4_valid_part_ids[i4_count];
2413
0
                S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2414
2415
                /* Use a pre-computed cost instead of freshly evaluating subpel cost */
2416
0
                i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
2417
2418
                /*Calculate total cost*/
2419
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2420
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2421
2422
                /*****************************************************************/
2423
                /* We do not labor through the results if the total cost worse   */
2424
                /* than the last of the results.                                 */
2425
                /*****************************************************************/
2426
0
                best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
2427
0
                second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
2428
2429
0
                if(i4_tot_cost < second_best_node_cost)
2430
0
                {
2431
0
                    update_required = 2;
2432
2433
                    /*************************************************************/
2434
                    /* Identify where the current result isto be placed.Basically*/
2435
                    /* find the node which has cost just higher thannodeundertest*/
2436
                    /*************************************************************/
2437
0
                    if(i4_tot_cost < best_node_cost)
2438
0
                    {
2439
0
                        update_required = 1;
2440
0
                    }
2441
0
                    else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
2442
0
                    {
2443
0
                        update_required = 0;
2444
0
                    }
2445
0
                    if(update_required == 2)
2446
0
                    {
2447
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2448
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2449
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
2450
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
2451
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
2452
0
                    }
2453
0
                    else if(update_required == 1)
2454
0
                    {
2455
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
2456
0
                            ps_subpel_refine_ctxt->i2_tot_cost[0][index];
2457
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
2458
0
                            ps_subpel_refine_ctxt->i2_mv_cost[0][index];
2459
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index] =
2460
0
                            ps_subpel_refine_ctxt->i2_mv_x[0][index];
2461
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index] =
2462
0
                            ps_subpel_refine_ctxt->i2_mv_y[0][index];
2463
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
2464
0
                            ps_subpel_refine_ctxt->i2_ref_idx[0][index];
2465
2466
0
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2467
0
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2468
0
                        ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
2469
0
                        ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
2470
0
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
2471
0
                    }
2472
0
                }
2473
0
            }
2474
0
        }
2475
0
    }
2476
2477
0
    {
2478
0
        WORD32 i4_count = 0;
2479
0
        for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
2480
0
        {
2481
0
            WORD32 j;
2482
0
            for(j = 0; j < 2; j++)
2483
0
            {
2484
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[j][i4_count] >= MAX_SIGNED_16BIT_VAL)
2485
0
                {
2486
0
                    ps_subpel_refine_ctxt->ai2_fullpel_satd[j][i4_count] = MAX_SIGNED_16BIT_VAL;
2487
0
                }
2488
0
            }
2489
0
        }
2490
0
    }
2491
0
}
2492
2493
void hme_calc_stim_injected_sad_and_2_best_results(
2494
    hme_search_prms_t *ps_search_prms,
2495
    wgt_pred_ctxt_t *ps_wt_inp_prms,
2496
    err_prms_t *ps_err_prms,
2497
    result_upd_prms_t *ps_result_prms,
2498
    U08 **ppu1_ref,
2499
    S32 i4_ref_stride)
2500
0
{
2501
0
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
2502
0
    search_node_t *ps_search_node;
2503
2504
0
    S32 i4_candt;
2505
0
    S32 i4_count;
2506
0
    S32 i4_inp_off;
2507
0
    S32 i4_ref_offset;
2508
0
    S32 i4_num_nodes;
2509
0
    ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
2510
0
        au8_final_ref_sigmaXSquared[17];
2511
0
    UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
2512
0
    S32 *pi4_valid_part_ids;
2513
2514
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2515
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2516
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2517
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2518
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2519
2520
0
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2521
0
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2522
0
    i4_inp_off = ps_search_prms->i4_cu_x_off;
2523
0
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2524
0
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2525
0
    ps_search_node = ps_search_prms->ps_search_nodes;
2526
0
    pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2527
2528
    /* Set local pointer to point to partition level sigma values calculated in hme_refine */
2529
0
    au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
2530
0
    au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
2531
2532
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2533
0
    {
2534
0
        {
2535
0
            WORD32 b, c, d;
2536
0
            UWORD8 *pu1_cur_ptr;
2537
0
            UWORD8 *pu1_ref_ptr;
2538
0
            UWORD16 au2_4x4_sad[NUM_4X4];
2539
2540
0
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2541
0
            {
2542
0
                continue;
2543
0
            }
2544
2545
0
            ps_err_prms->pu1_inp =
2546
0
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2547
0
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2548
0
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2549
0
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2550
2551
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2552
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2553
2554
            /* Loop to compute the SAD's */
2555
0
            {
2556
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2557
0
                for(b = 0; b < NUM_4X4; b++)
2558
0
                {
2559
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2560
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2561
2562
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2563
0
                    {
2564
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2565
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2566
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2567
0
                        {
2568
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
2569
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2570
0
                        }
2571
0
                    }
2572
0
                }
2573
2574
                /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
2575
0
                hme_compute_sigmaX_and_sigmaXSquared(
2576
0
                    pu1_ref_ptr,
2577
0
                    ref_buf_stride,
2578
0
                    au4_4x4_ref_sigmaX,
2579
0
                    au4_4x4_ref_sigmaXSquared,
2580
0
                    4,
2581
0
                    4,
2582
0
                    16,
2583
0
                    16,
2584
0
                    1,
2585
0
                    4);
2586
2587
0
                pi4_sad_grid[PART_ID_NxN_TL] =
2588
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2589
0
                pi4_sad_grid[PART_ID_NxN_TR] =
2590
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2591
0
                pi4_sad_grid[PART_ID_NxN_BL] =
2592
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2593
0
                pi4_sad_grid[PART_ID_NxN_BR] =
2594
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2595
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
2596
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2597
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
2598
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2599
0
                pi4_sad_grid[PART_ID_2NxN_T] =
2600
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2601
0
                pi4_sad_grid[PART_ID_2NxN_B] =
2602
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2603
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
2604
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2605
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
2606
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2607
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
2608
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2609
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
2610
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2611
0
                pi4_sad_grid[PART_ID_2Nx2N] =
2612
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2613
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
2614
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2615
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
2616
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2617
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
2618
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2619
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
2620
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2621
0
            }
2622
0
        }
2623
2624
0
        {
2625
0
            S32 i4_sad, i4_mv_cost, i4_tot_cost;
2626
0
            S32 best_node_cost;
2627
0
            S32 second_best_node_cost;
2628
0
            ULWORD64 u8_temp_var, u8_temp_var1;
2629
0
            ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
2630
2631
0
            {
2632
0
                S16 mvdx1, mvdy1;
2633
0
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2634
0
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2635
0
                S32 pred_lx = i4_search_idx;
2636
2637
0
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2638
0
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2639
0
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2640
2641
0
                S32 inp_shift = 2;
2642
0
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2643
0
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2644
0
                S32 lambda = ps_pred_ctxt->lambda;
2645
0
                S32 rnd = 1 << (lambda_q_shift - 1);
2646
0
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2647
0
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2648
0
                S32 ref_bits =
2649
0
                    ps_pred_ctxt
2650
0
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2651
2652
0
                COMPUTE_DIFF_MV(
2653
0
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2654
2655
0
                mvdx1 = ABS(mvdx1);
2656
0
                mvdy1 = ABS(mvdy1);
2657
2658
0
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2659
0
                             (mvdy1 > 0) + ref_bits + 2;
2660
2661
0
                i4_mv_cost *= lambda;
2662
0
                i4_mv_cost += rnd;
2663
0
                i4_mv_cost >>= lambda_q_shift;
2664
2665
0
                i4_mv_cost = CLIP_U16(i4_mv_cost);
2666
0
            }
2667
2668
0
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2669
0
            {
2670
0
                S32 i4_stim_injected_sad;
2671
0
                S32 i4_stim_injected_cost;
2672
0
                S32 i4_noise_term;
2673
0
                unsigned long u4_shift_val;
2674
0
                S32 i4_bits_req;
2675
2676
0
                S32 update_required = 0;
2677
0
                S32 part_id = pi4_valid_part_ids[i4_count];
2678
0
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2679
2680
0
                WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
2681
2682
0
                S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
2683
2684
0
                if(ps_search_prms->i4_alpha_stim_multiplier)
2685
0
                {
2686
                    /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
2687
0
                    hme_compute_final_sigma_of_pu_from_base_blocks(
2688
0
                        au4_4x4_ref_sigmaX,
2689
0
                        au4_4x4_ref_sigmaXSquared,
2690
0
                        au8_final_ref_sigmaX,
2691
0
                        au8_final_ref_sigmaXSquared,
2692
0
                        16,
2693
0
                        4,
2694
0
                        part_id,
2695
0
                        4);
2696
2697
0
                    u8_ref_X_Square =
2698
0
                        (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
2699
0
                    u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
2700
2701
                    /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
2702
                    /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
2703
                    /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
2704
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
2705
0
                        au8_final_src_sigmaX,
2706
0
                        au8_final_src_sigmaXSquared,
2707
0
                        &u8_src_var,
2708
0
                        i4_inv_wt,
2709
0
                        ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
2710
0
                        ps_wt_inp_prms->wpred_log_wdc,
2711
0
                        part_id);
2712
2713
0
                    u8_ref_var = u8_ref_var >> u4_shift_val;
2714
2715
                    /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
2716
0
                    GETRANGE64(i4_bits_req, u8_ref_var);
2717
2718
0
                    if(i4_bits_req > 27)
2719
0
                    {
2720
0
                        u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
2721
0
                        u8_src_var = u8_src_var >> (i4_bits_req - 27);
2722
0
                    }
2723
2724
0
                    if(u8_src_var == u8_ref_var)
2725
0
                    {
2726
0
                        u8_temp_var = (1 << STIM_Q_FORMAT);
2727
0
                    }
2728
0
                    else
2729
0
                    {
2730
0
                        u8_temp_var = (2 * u8_src_var * u8_ref_var);
2731
0
                        u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
2732
0
                        u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
2733
0
                        u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
2734
0
                        u8_temp_var = (u8_temp_var / u8_temp_var1);
2735
0
                    }
2736
2737
0
                    i4_noise_term = (UWORD32)u8_temp_var;
2738
2739
0
                    ASSERT(i4_noise_term >= 0);
2740
2741
0
                    i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
2742
0
                }
2743
0
                else
2744
0
                {
2745
0
                    i4_noise_term = 0;
2746
0
                }
2747
0
                u8_pure_dist = pi4_sad_grid[part_id];
2748
0
                u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
2749
0
                u8_pure_dist += (1 << ((i4_q_level)-1));
2750
0
                i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
2751
2752
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2753
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2754
0
                i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
2755
0
                i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
2756
2757
0
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
2758
0
                second_best_node_cost =
2759
0
                    CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[1][index]);
2760
2761
0
                if(i4_stim_injected_cost < second_best_node_cost)
2762
0
                {
2763
0
                    update_required = 2;
2764
2765
0
                    if(i4_stim_injected_cost < best_node_cost)
2766
0
                    {
2767
0
                        update_required = 1;
2768
0
                    }
2769
0
                    else if(i4_stim_injected_cost == best_node_cost)
2770
0
                    {
2771
0
                        update_required = 0;
2772
0
                    }
2773
2774
0
                    if(update_required == 2)
2775
0
                    {
2776
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2777
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
2778
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2779
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2780
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2781
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2782
0
                    }
2783
0
                    else if(update_required == 1)
2784
0
                    {
2785
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2786
0
                            ps_mv_refine_ctxt->i2_tot_cost[0][index];
2787
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] =
2788
0
                            ps_mv_refine_ctxt->i2_stim_injected_cost[0][index];
2789
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2790
0
                            ps_mv_refine_ctxt->i2_mv_cost[0][index];
2791
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2792
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2793
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2794
0
                            ps_mv_refine_ctxt->i2_ref_idx[0][index];
2795
2796
0
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2797
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
2798
0
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2799
0
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2800
0
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2801
0
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2802
0
                    }
2803
0
                }
2804
0
            }
2805
0
        }
2806
2807
0
        ps_search_node++;
2808
0
    }
2809
2810
0
    {
2811
0
        WORD32 i4_i;
2812
0
        WORD32 part_id;
2813
0
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2814
0
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2815
0
        {
2816
0
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2817
0
            if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2818
0
            {
2819
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2820
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2821
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2822
2823
0
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2824
0
            }
2825
0
            if(ps_mv_refine_ctxt->i2_stim_injected_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2826
0
            {
2827
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2828
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2829
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2830
2831
0
                ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2832
0
            }
2833
0
        }
2834
0
    }
2835
0
}
2836
2837
void hme_calc_sad_and_1_best_result(
2838
    hme_search_prms_t *ps_search_prms,
2839
    wgt_pred_ctxt_t *ps_wt_inp_prms,
2840
    err_prms_t *ps_err_prms,
2841
    result_upd_prms_t *ps_result_prms,
2842
    U08 **ppu1_ref,
2843
    S32 i4_ref_stride)
2844
659k
{
2845
659k
    S32 i4_candt;
2846
659k
    S32 i4_inp_off;
2847
659k
    S32 i4_ref_offset;
2848
659k
    S32 i4_num_nodes;
2849
2850
659k
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2851
659k
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2852
659k
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2853
659k
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2854
659k
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2855
2856
659k
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
2857
659k
    search_node_t *ps_search_node;
2858
2859
659k
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2860
659k
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2861
659k
    i4_inp_off = ps_search_prms->i4_cu_x_off;
2862
659k
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2863
659k
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2864
659k
    ps_search_node = ps_search_prms->ps_search_nodes;
2865
2866
6.09M
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2867
5.43M
    {
2868
        /**********************************************************************/
2869
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
2870
        /**********************************************************************/
2871
5.43M
        {
2872
5.43M
            WORD32 b, c, d;
2873
5.43M
            UWORD8 *pu1_cur_ptr;
2874
5.43M
            UWORD8 *pu1_ref_ptr;
2875
5.43M
            UWORD16 au2_4x4_sad[NUM_4X4];
2876
2877
5.43M
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2878
0
            {
2879
0
                continue;
2880
0
            }
2881
2882
5.43M
            ps_err_prms->pu1_inp =
2883
5.43M
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2884
5.43M
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2885
5.43M
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2886
5.43M
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2887
2888
5.43M
            pu1_cur_ptr = ps_err_prms->pu1_inp;
2889
5.43M
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2890
2891
            /* Loop to compute the SAD's */
2892
5.43M
            {
2893
5.43M
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2894
92.4M
                for(b = 0; b < NUM_4X4; b++)
2895
87.0M
                {
2896
87.0M
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2897
87.0M
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2898
2899
435M
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2900
348M
                    {
2901
348M
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
2902
348M
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
2903
1.74G
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2904
1.39G
                        {
2905
1.39G
                            au2_4x4_sad[b] += (UWORD16)ABS((
2906
1.39G
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2907
1.39G
                        }
2908
348M
                    }
2909
87.0M
                }
2910
2911
5.43M
                pi4_sad_grid[PART_ID_NxN_TL] =
2912
5.43M
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2913
5.43M
                pi4_sad_grid[PART_ID_NxN_TR] =
2914
5.43M
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2915
5.43M
                pi4_sad_grid[PART_ID_NxN_BL] =
2916
5.43M
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2917
5.43M
                pi4_sad_grid[PART_ID_NxN_BR] =
2918
5.43M
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2919
5.43M
                pi4_sad_grid[PART_ID_Nx2N_L] =
2920
5.43M
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2921
5.43M
                pi4_sad_grid[PART_ID_Nx2N_R] =
2922
5.43M
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2923
5.43M
                pi4_sad_grid[PART_ID_2NxN_T] =
2924
5.43M
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2925
5.43M
                pi4_sad_grid[PART_ID_2NxN_B] =
2926
5.43M
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2927
5.43M
                pi4_sad_grid[PART_ID_nLx2N_L] =
2928
5.43M
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2929
5.43M
                pi4_sad_grid[PART_ID_nRx2N_R] =
2930
5.43M
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2931
5.43M
                pi4_sad_grid[PART_ID_2NxnU_T] =
2932
5.43M
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2933
5.43M
                pi4_sad_grid[PART_ID_2NxnD_B] =
2934
5.43M
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2935
5.43M
                pi4_sad_grid[PART_ID_2Nx2N] =
2936
5.43M
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2937
5.43M
                pi4_sad_grid[PART_ID_2NxnU_B] =
2938
5.43M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2939
5.43M
                pi4_sad_grid[PART_ID_2NxnD_T] =
2940
5.43M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2941
5.43M
                pi4_sad_grid[PART_ID_nRx2N_L] =
2942
5.43M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2943
5.43M
                pi4_sad_grid[PART_ID_nLx2N_R] =
2944
5.43M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2945
5.43M
            }
2946
5.43M
        }
2947
2948
0
        {
2949
5.43M
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2950
5.43M
            S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2951
5.43M
            S32 best_node_cost;
2952
5.43M
            S32 second_best_node_cost;
2953
2954
5.43M
            {
2955
5.43M
                S16 mvdx1, mvdy1;
2956
5.43M
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2957
5.43M
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2958
5.43M
                S32 pred_lx = i4_search_idx;
2959
2960
5.43M
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2961
5.43M
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2962
5.43M
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2963
2964
5.43M
                S32 inp_shift = 2;
2965
5.43M
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2966
5.43M
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2967
5.43M
                S32 lambda = ps_pred_ctxt->lambda;
2968
5.43M
                S32 rnd = 1 << (lambda_q_shift - 1);
2969
5.43M
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2970
5.43M
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2971
5.43M
                S32 ref_bits =
2972
5.43M
                    ps_pred_ctxt
2973
5.43M
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2974
2975
5.43M
                COMPUTE_DIFF_MV(
2976
5.43M
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2977
2978
5.43M
                mvdx1 = ABS(mvdx1);
2979
5.43M
                mvdy1 = ABS(mvdy1);
2980
2981
5.43M
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2982
5.43M
                             (mvdy1 > 0) + ref_bits + 2;
2983
2984
5.43M
                i4_mv_cost *= lambda;
2985
5.43M
                i4_mv_cost += rnd;
2986
5.43M
                i4_mv_cost >>= lambda_q_shift;
2987
2988
5.43M
                i4_mv_cost = CLIP_U16(i4_mv_cost);
2989
5.43M
            }
2990
2991
            /*For each valid partition, update the refine_prm structure to reflect the best and second
2992
            best candidates for that partition*/
2993
2994
57.8M
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2995
52.4M
            {
2996
52.4M
                S32 update_required = 0;
2997
52.4M
                S32 part_id = pi4_valid_part_ids[i4_count];
2998
52.4M
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2999
3000
                /*Calculate total cost*/
3001
52.4M
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3002
52.4M
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3003
3004
                /*****************************************************************/
3005
                /* We do not labor through the results if the total cost worse   */
3006
                /* than the last of the results.                                 */
3007
                /*****************************************************************/
3008
52.4M
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
3009
52.4M
                second_best_node_cost = SHRT_MAX;
3010
3011
52.4M
                if(i4_tot_cost < second_best_node_cost)
3012
52.2M
                {
3013
52.2M
                    update_required = 0;
3014
3015
                    /*************************************************************/
3016
                    /* Identify where the current result isto be placed.Basically*/
3017
                    /* find the node which has cost just higher thannodeundertest*/
3018
                    /*************************************************************/
3019
52.2M
                    if(i4_tot_cost < best_node_cost)
3020
6.73M
                    {
3021
6.73M
                        update_required = 1;
3022
6.73M
                    }
3023
45.5M
                    else if(i4_tot_cost == best_node_cost)
3024
2.82M
                    {
3025
2.82M
                        update_required = 0;
3026
2.82M
                    }
3027
3028
52.2M
                    if(update_required == 2)
3029
0
                    {
3030
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3031
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3032
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3033
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3034
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3035
0
                    }
3036
52.2M
                    else if(update_required == 1)
3037
6.73M
                    {
3038
6.73M
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3039
6.73M
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3040
6.73M
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3041
6.73M
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3042
6.73M
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3043
6.73M
                    }
3044
52.2M
                }
3045
52.4M
            }
3046
5.43M
        }
3047
5.43M
        ps_search_node++;
3048
5.43M
    }
3049
3050
659k
    {
3051
659k
        WORD32 i4_i;
3052
659k
        WORD32 part_id;
3053
659k
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3054
6.14M
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3055
5.48M
        {
3056
5.48M
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3057
5.48M
            if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3058
267k
            {
3059
267k
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3060
267k
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3061
267k
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3062
3063
267k
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3064
267k
            }
3065
5.48M
        }
3066
659k
    }
3067
659k
}
3068
3069
void hme_calc_stim_injected_sad_and_1_best_result(
3070
    hme_search_prms_t *ps_search_prms,
3071
    wgt_pred_ctxt_t *ps_wt_inp_prms,
3072
    err_prms_t *ps_err_prms,
3073
    result_upd_prms_t *ps_result_prms,
3074
    U08 **ppu1_ref,
3075
    S32 i4_ref_stride)
3076
0
{
3077
0
    mv_refine_ctxt_t *ps_mv_refine_ctxt;
3078
0
    search_node_t *ps_search_node;
3079
3080
0
    S32 i4_candt;
3081
0
    S32 i4_count;
3082
0
    S32 i4_inp_off;
3083
0
    S32 i4_ref_offset;
3084
0
    S32 i4_num_nodes;
3085
0
    ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
3086
0
        au8_final_ref_sigmaXSquared[17];
3087
0
    UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
3088
0
    S32 *pi4_valid_part_ids;
3089
3090
0
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3091
0
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3092
0
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3093
0
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3094
0
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3095
3096
0
    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
3097
0
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3098
0
    i4_inp_off = ps_search_prms->i4_cu_x_off;
3099
0
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
3100
0
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3101
0
    ps_search_node = ps_search_prms->ps_search_nodes;
3102
0
    pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
3103
3104
    /* Set local pointer to point to partition level sigma values calculated in hme_refine */
3105
0
    au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
3106
0
    au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
3107
3108
0
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3109
0
    {
3110
0
        {
3111
0
            WORD32 b, c, d;
3112
0
            UWORD8 *pu1_cur_ptr;
3113
0
            UWORD8 *pu1_ref_ptr;
3114
0
            UWORD16 au2_4x4_sad[NUM_4X4];
3115
3116
0
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3117
0
            {
3118
0
                continue;
3119
0
            }
3120
3121
0
            ps_err_prms->pu1_inp =
3122
0
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3123
0
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3124
0
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3125
0
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3126
3127
0
            pu1_cur_ptr = ps_err_prms->pu1_inp;
3128
0
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3129
3130
            /* Loop to compute the SAD's */
3131
0
            {
3132
0
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3133
0
                for(b = 0; b < NUM_4X4; b++)
3134
0
                {
3135
0
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3136
0
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3137
3138
0
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3139
0
                    {
3140
0
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
3141
0
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
3142
0
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3143
0
                        {
3144
0
                            au2_4x4_sad[b] += (UWORD16)ABS((
3145
0
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3146
0
                        }
3147
0
                    }
3148
0
                }
3149
3150
                /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
3151
0
                hme_compute_sigmaX_and_sigmaXSquared(
3152
0
                    pu1_ref_ptr,
3153
0
                    ref_buf_stride,
3154
0
                    au4_4x4_ref_sigmaX,
3155
0
                    au4_4x4_ref_sigmaXSquared,
3156
0
                    4,
3157
0
                    4,
3158
0
                    16,
3159
0
                    16,
3160
0
                    1,
3161
0
                    4);
3162
3163
0
                pi4_sad_grid[PART_ID_NxN_TL] =
3164
0
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3165
0
                pi4_sad_grid[PART_ID_NxN_TR] =
3166
0
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3167
0
                pi4_sad_grid[PART_ID_NxN_BL] =
3168
0
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3169
0
                pi4_sad_grid[PART_ID_NxN_BR] =
3170
0
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3171
0
                pi4_sad_grid[PART_ID_Nx2N_L] =
3172
0
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3173
0
                pi4_sad_grid[PART_ID_Nx2N_R] =
3174
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3175
0
                pi4_sad_grid[PART_ID_2NxN_T] =
3176
0
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3177
0
                pi4_sad_grid[PART_ID_2NxN_B] =
3178
0
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3179
0
                pi4_sad_grid[PART_ID_nLx2N_L] =
3180
0
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3181
0
                pi4_sad_grid[PART_ID_nRx2N_R] =
3182
0
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3183
0
                pi4_sad_grid[PART_ID_2NxnU_T] =
3184
0
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3185
0
                pi4_sad_grid[PART_ID_2NxnD_B] =
3186
0
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3187
0
                pi4_sad_grid[PART_ID_2Nx2N] =
3188
0
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3189
0
                pi4_sad_grid[PART_ID_2NxnU_B] =
3190
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3191
0
                pi4_sad_grid[PART_ID_2NxnD_T] =
3192
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3193
0
                pi4_sad_grid[PART_ID_nRx2N_L] =
3194
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3195
0
                pi4_sad_grid[PART_ID_nLx2N_R] =
3196
0
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3197
0
            }
3198
0
        }
3199
3200
0
        {
3201
0
            S32 i4_sad, i4_mv_cost, i4_tot_cost;
3202
0
            S32 best_node_cost;
3203
0
            S32 second_best_node_cost;
3204
0
            ULWORD64 u8_temp_var, u8_temp_var1;
3205
0
            ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
3206
3207
0
            {
3208
0
                S16 mvdx1, mvdy1;
3209
0
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
3210
0
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
3211
0
                S32 pred_lx = i4_search_idx;
3212
3213
0
                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
3214
0
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
3215
0
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
3216
3217
0
                S32 inp_shift = 2;
3218
0
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3219
0
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
3220
0
                S32 lambda = ps_pred_ctxt->lambda;
3221
0
                S32 rnd = 1 << (lambda_q_shift - 1);
3222
0
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3223
0
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3224
0
                S32 ref_bits =
3225
0
                    ps_pred_ctxt
3226
0
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
3227
3228
0
                COMPUTE_DIFF_MV(
3229
0
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
3230
3231
0
                mvdx1 = ABS(mvdx1);
3232
0
                mvdy1 = ABS(mvdy1);
3233
3234
0
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
3235
0
                             (mvdy1 > 0) + ref_bits + 2;
3236
3237
0
                i4_mv_cost *= lambda;
3238
0
                i4_mv_cost += rnd;
3239
0
                i4_mv_cost >>= lambda_q_shift;
3240
3241
0
                i4_mv_cost = CLIP_U16(i4_mv_cost);
3242
0
            }
3243
3244
0
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
3245
0
            {
3246
0
                S32 i4_stim_injected_sad;
3247
0
                S32 i4_stim_injected_cost;
3248
0
                S32 i4_noise_term;
3249
0
                unsigned long u4_shift_val;
3250
0
                S32 i4_bits_req;
3251
3252
0
                S32 update_required = 0;
3253
0
                S32 part_id = pi4_valid_part_ids[i4_count];
3254
0
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3255
3256
0
                WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
3257
3258
0
                S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
3259
3260
0
                if(ps_search_prms->i4_alpha_stim_multiplier)
3261
0
                {
3262
                    /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
3263
0
                    hme_compute_final_sigma_of_pu_from_base_blocks(
3264
0
                        au4_4x4_ref_sigmaX,
3265
0
                        au4_4x4_ref_sigmaXSquared,
3266
0
                        au8_final_ref_sigmaX,
3267
0
                        au8_final_ref_sigmaXSquared,
3268
0
                        16,
3269
0
                        4,
3270
0
                        part_id,
3271
0
                        4);
3272
3273
0
                    u8_ref_X_Square =
3274
0
                        (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
3275
0
                    u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
3276
3277
                    /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
3278
                    /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
3279
                    /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
3280
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
3281
0
                        au8_final_src_sigmaX,
3282
0
                        au8_final_src_sigmaXSquared,
3283
0
                        &u8_src_var,
3284
0
                        i4_inv_wt,
3285
0
                        ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
3286
0
                        ps_wt_inp_prms->wpred_log_wdc,
3287
0
                        part_id);
3288
3289
0
                    u8_ref_var = u8_ref_var >> u4_shift_val;
3290
3291
                    /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
3292
0
                    GETRANGE64(i4_bits_req, u8_ref_var);
3293
3294
0
                    if(i4_bits_req > 27)
3295
0
                    {
3296
0
                        u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
3297
0
                        u8_src_var = u8_src_var >> (i4_bits_req - 27);
3298
0
                    }
3299
3300
0
                    if(u8_src_var == u8_ref_var)
3301
0
                    {
3302
0
                        u8_temp_var = (1 << STIM_Q_FORMAT);
3303
0
                    }
3304
0
                    else
3305
0
                    {
3306
0
                        u8_temp_var = (2 * u8_src_var * u8_ref_var);
3307
0
                        u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
3308
0
                        u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
3309
0
                        u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
3310
0
                        u8_temp_var = (u8_temp_var / u8_temp_var1);
3311
0
                    }
3312
3313
0
                    i4_noise_term = (UWORD32)u8_temp_var;
3314
3315
0
                    ASSERT(i4_noise_term >= 0);
3316
3317
0
                    i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
3318
0
                }
3319
0
                else
3320
0
                {
3321
0
                    i4_noise_term = 0;
3322
0
                }
3323
0
                u8_pure_dist = pi4_sad_grid[part_id];
3324
0
                u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
3325
0
                u8_pure_dist += (1 << ((i4_q_level)-1));
3326
0
                i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
3327
3328
0
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3329
0
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3330
0
                i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
3331
0
                i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
3332
3333
0
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
3334
0
                second_best_node_cost = SHRT_MAX;
3335
3336
0
                if(i4_stim_injected_cost < second_best_node_cost)
3337
0
                {
3338
0
                    update_required = 0;
3339
3340
0
                    if(i4_stim_injected_cost < best_node_cost)
3341
0
                    {
3342
0
                        update_required = 1;
3343
0
                    }
3344
0
                    else if(i4_stim_injected_cost == best_node_cost)
3345
0
                    {
3346
0
                        update_required = 0;
3347
0
                    }
3348
3349
0
                    if(update_required == 2)
3350
0
                    {
3351
0
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3352
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
3353
0
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3354
0
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3355
0
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3356
0
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3357
0
                    }
3358
0
                    else if(update_required == 1)
3359
0
                    {
3360
0
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3361
0
                        ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
3362
0
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3363
0
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3364
0
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3365
0
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3366
0
                    }
3367
0
                }
3368
0
            }
3369
0
        }
3370
3371
0
        ps_search_node++;
3372
0
    }
3373
3374
0
    {
3375
0
        WORD32 i4_i;
3376
0
        WORD32 part_id;
3377
0
        search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3378
0
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3379
0
        {
3380
0
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3381
0
            if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3382
0
            {
3383
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3384
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3385
0
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3386
3387
0
                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3388
0
            }
3389
0
        }
3390
0
    }
3391
0
}
3392
3393
void hme_calc_sad_and_1_best_result_subpel(
3394
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
3395
1.73M
{
3396
1.73M
    S32 i4_candt;
3397
1.73M
    S32 i4_num_nodes;
3398
3399
1.73M
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3400
3401
1.73M
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3402
1.73M
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3403
1.73M
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3404
1.73M
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3405
3406
1.73M
    mv_refine_ctxt_t *ps_subpel_refine_ctxt;
3407
1.73M
    ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
3408
1.73M
    i4_num_nodes = 1;
3409
3410
    /* Run through each of the candts in a loop */
3411
3.47M
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3412
1.73M
    {
3413
        /**********************************************************************/
3414
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
3415
        /**********************************************************************/
3416
1.73M
        {
3417
1.73M
            WORD32 b, c, d;
3418
1.73M
            UWORD8 *pu1_cur_ptr;
3419
1.73M
            UWORD8 *pu1_ref_ptr;
3420
1.73M
            UWORD16 au2_4x4_sad[NUM_4X4];
3421
3422
1.73M
            pu1_cur_ptr = ps_err_prms->pu1_inp;
3423
1.73M
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3424
3425
            /* Loop to compute the SAD's */
3426
1.73M
            {
3427
1.73M
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3428
29.5M
                for(b = 0; b < NUM_4X4; b++)
3429
27.8M
                {
3430
27.8M
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3431
27.8M
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3432
3433
139M
                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3434
111M
                    {
3435
111M
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
3436
111M
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
3437
556M
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3438
445M
                        {
3439
445M
                            au2_4x4_sad[b] += (UWORD16)ABS((
3440
445M
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3441
445M
                        }
3442
111M
                    }
3443
27.8M
                }
3444
3445
1.73M
                pi4_sad_grid[PART_ID_NxN_TL] =
3446
1.73M
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3447
1.73M
                pi4_sad_grid[PART_ID_NxN_TR] =
3448
1.73M
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3449
1.73M
                pi4_sad_grid[PART_ID_NxN_BL] =
3450
1.73M
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3451
1.73M
                pi4_sad_grid[PART_ID_NxN_BR] =
3452
1.73M
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3453
1.73M
                pi4_sad_grid[PART_ID_Nx2N_L] =
3454
1.73M
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3455
1.73M
                pi4_sad_grid[PART_ID_Nx2N_R] =
3456
1.73M
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3457
1.73M
                pi4_sad_grid[PART_ID_2NxN_T] =
3458
1.73M
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3459
1.73M
                pi4_sad_grid[PART_ID_2NxN_B] =
3460
1.73M
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3461
1.73M
                pi4_sad_grid[PART_ID_nLx2N_L] =
3462
1.73M
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3463
1.73M
                pi4_sad_grid[PART_ID_nRx2N_R] =
3464
1.73M
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3465
1.73M
                pi4_sad_grid[PART_ID_2NxnU_T] =
3466
1.73M
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3467
1.73M
                pi4_sad_grid[PART_ID_2NxnD_B] =
3468
1.73M
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3469
1.73M
                pi4_sad_grid[PART_ID_2Nx2N] =
3470
1.73M
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3471
1.73M
                pi4_sad_grid[PART_ID_2NxnU_B] =
3472
1.73M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3473
1.73M
                pi4_sad_grid[PART_ID_2NxnD_T] =
3474
1.73M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3475
1.73M
                pi4_sad_grid[PART_ID_nRx2N_L] =
3476
1.73M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3477
1.73M
                pi4_sad_grid[PART_ID_nLx2N_R] =
3478
1.73M
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3479
1.73M
            }
3480
1.73M
        }
3481
        /**********************************************************************/
3482
        /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
3483
        /**********************************************************************/
3484
1.73M
        {
3485
1.73M
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
3486
1.73M
            S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
3487
1.73M
            S32 best_node_cost;
3488
1.73M
            S32 second_best_node_cost;
3489
3490
            /*For each valid partition, update the refine_prm structure to reflect the best and second
3491
            best candidates for that partition*/
3492
3493
6.46M
            for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
3494
4.72M
            {
3495
4.72M
                S32 update_required = 0;
3496
4.72M
                S32 part_id = pi4_valid_part_ids[i4_count];
3497
4.72M
                S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3498
3499
                /* Use a pre-computed cost instead of freshly evaluating subpel cost */
3500
4.72M
                i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3501
3502
                /*Calculate total cost*/
3503
4.72M
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3504
4.72M
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3505
3506
                /*****************************************************************/
3507
                /* We do not labor through the results if the total cost worse   */
3508
                /* than the last of the results.                                 */
3509
                /*****************************************************************/
3510
4.72M
                best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
3511
4.72M
                second_best_node_cost = SHRT_MAX;
3512
3513
4.72M
                if(i4_tot_cost < second_best_node_cost)
3514
4.72M
                {
3515
4.72M
                    update_required = 0;
3516
3517
                    /*************************************************************/
3518
                    /* Identify where the current result isto be placed.Basically*/
3519
                    /* find the node which has cost just higher thannodeundertest*/
3520
                    /*************************************************************/
3521
4.72M
                    if(i4_tot_cost < best_node_cost)
3522
233k
                    {
3523
233k
                        update_required = 1;
3524
233k
                    }
3525
4.49M
                    else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
3526
1.60M
                    {
3527
1.60M
                        update_required = 0;
3528
1.60M
                    }
3529
4.72M
                    if(update_required == 2)
3530
0
                    {
3531
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3532
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3533
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
3534
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
3535
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
3536
0
                    }
3537
4.72M
                    else if(update_required == 1)
3538
233k
                    {
3539
233k
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3540
233k
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3541
233k
                        ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
3542
233k
                        ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
3543
233k
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
3544
233k
                    }
3545
4.72M
                }
3546
4.72M
            }
3547
1.73M
        }
3548
1.73M
    }
3549
3550
1.73M
    {
3551
1.73M
        WORD32 i4_count = 0;
3552
31.2M
        for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
3553
29.5M
        {
3554
29.5M
            if(ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] >= MAX_SIGNED_16BIT_VAL)
3555
24.8M
            {
3556
24.8M
                ps_subpel_refine_ctxt->ai2_fullpel_satd[0][i4_count] = MAX_SIGNED_16BIT_VAL;
3557
24.8M
            }
3558
29.5M
        }
3559
1.73M
    }
3560
1.73M
}
3561
3562
/**
3563
********************************************************************************
3564
*  @fn     hme_calc_pt_sad_and_result_explicit(hme_search_prms_t *ps_search_prms,
3565
*                                              wgt_pred_ctxt_t *ps_wt_inp_prms,
3566
*                                              err_prms_t *ps_err_prms,
3567
*                                              result_upd_prms_t *ps_result_prms,
3568
*                                              U08 **ppu1_ref,
3569
*                                              S32 i4_ref_stride)
3570
*
3571
*  @brief   Run thorugh the provided candidates and compute the point SAD and
3572
*           cost and update the results in the order
3573
*
3574
*  @param[in]  ps_search_prms
3575
*  @param[in]  ps_wt_inp_prms
3576
*  @param[in]  ps_err_prms
3577
*  @param[out] ps_result_prms
3578
*  @param[in]  ppu1_ref
3579
*  @param[in]  i4_ref_stride
3580
*
3581
*  @return   None
3582
********************************************************************************
3583
*/
3584
3585
void hme_calc_pt_sad_and_result_explicit(
3586
    hme_search_prms_t *ps_search_prms,
3587
    wgt_pred_ctxt_t *ps_wt_inp_prms,
3588
    err_prms_t *ps_err_prms,
3589
    result_upd_prms_t *ps_result_prms,
3590
    U08 **ppu1_ref,
3591
    S32 i4_ref_stride)
3592
811k
{
3593
811k
    WORD32 i4_grid_mask, i4_part_mask, i4_num_results, i4_candt, i4_num_nodes;
3594
811k
    WORD32 i4_inp_stride, i4_inp_off, i4_ref_offset;
3595
3596
811k
    search_node_t *ps_search_node;
3597
811k
    BLK_SIZE_T e_blk_size;
3598
811k
    PF_SAD_FXN_T pf_sad_fxn;
3599
811k
    PF_RESULT_FXN_T pf_hme_result_fxn;
3600
3601
811k
    i4_grid_mask = 0x1; /* Point SAD */
3602
3603
    /* Get the parameters required */
3604
811k
    i4_part_mask = ps_search_prms->i4_part_mask;
3605
811k
    e_blk_size = ps_search_prms->e_blk_size;
3606
811k
    i4_num_results = (S32)ps_search_prms->ps_search_results->u1_num_results_per_part;
3607
811k
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3608
811k
    ps_search_node = ps_search_prms->ps_search_nodes;
3609
3610
811k
    i4_inp_stride = ps_search_prms->i4_inp_stride;
3611
    /* Move to the location of the search blk in inp buffer */
3612
811k
    i4_inp_off = ps_search_prms->i4_cu_x_off;
3613
811k
    i4_inp_off += ps_search_prms->i4_cu_y_off * i4_inp_stride;
3614
811k
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3615
3616
811k
    pf_sad_fxn = hme_get_sad_fxn(e_blk_size, i4_grid_mask, i4_part_mask);
3617
    /**********************************************************************/
3618
    /* we have a sparsely populated SAD grid of size 9x17.                */
3619
    /* the id of the results in the grid is shown                         */
3620
    /*     5   2   6                                                      */
3621
    /*     1   0   3                                                      */
3622
    /*     7   4   8                                                      */
3623
    /* The motivation for choosing a grid like this is that               */
3624
    /* in case of no refinement, the central location is                  */
3625
    /* the first entry in the grid                                        */
3626
    /* Also for diamond, the 4 entries get considered first               */
3627
    /* This is consistent with the diamond notation used in               */
3628
    /* subpel refinement. To Check                                        */
3629
    /* Update the results for the given search candt                      */
3630
    /* returns the cost of the 2Nx2N partition                            */
3631
    /**********************************************************************/
3632
3633
    /* Get the modified update result fun. with CLIP16 of cost to match   */
3634
    /* with SIMD */
3635
811k
    pf_hme_result_fxn = hme_update_results_grid_pu_bestn_no_encode;
3636
3637
7.69M
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3638
6.88M
    {
3639
6.88M
        if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3640
0
            continue;
3641
3642
        /* initialize minimum cost for this candidate. As we search around */
3643
        /* this candidate, this is used to check early exit, when in any   */
3644
        /* given iteration, the center pt of the grid is lowest value      */
3645
6.88M
        ps_result_prms->i4_min_cost = MAX_32BIT_VAL;
3646
3647
6.88M
        ps_err_prms->pu1_inp = ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3648
6.88M
        ps_err_prms->i4_grid_mask = i4_grid_mask;
3649
3650
6.88M
        ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3651
6.88M
        ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3652
6.88M
        ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3653
3654
        /**********************************************************************/
3655
        /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
3656
        /**********************************************************************/
3657
6.88M
        pf_sad_fxn(ps_err_prms);
3658
3659
        /**********************************************************************/
3660
        /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
3661
        /**********************************************************************/
3662
6.88M
        ps_result_prms->i4_grid_mask = i4_grid_mask;
3663
6.88M
        ps_result_prms->ps_search_node_base = ps_search_node;
3664
6.88M
        pf_hme_result_fxn(ps_result_prms);
3665
3666
6.88M
        ps_search_node++;
3667
6.88M
    }
3668
811k
}
3669
3670
/**
3671
********************************************************************************
3672
*  @fn     hme_set_mvp_node(search_results_t *ps_search_results,
3673
*                           search_node_t *ps_candt_prj_coloc,
3674
*                           S08 i1_ref_idx)
3675
*
3676
*  @brief   Set node used for motion vector predictor computation
3677
*           Either TR or L is compared to projected colocated and
3678
*           closest is decided as MVP
3679
*
3680
*  @param[in]  ps_search_results
3681
*
3682
*  @param[in]  ps_candt_prj_coloc
3683
*
3684
*  @param[in]  i1_ref_idx
3685
*
3686
*  @return   None
3687
********************************************************************************
3688
*/
3689
void hme_set_mvp_node(
3690
    search_results_t *ps_search_results,
3691
    search_node_t *ps_candt_prj_coloc,
3692
    U08 u1_pred_lx,
3693
    U08 u1_default_ref_id)
3694
399k
{
3695
399k
    S32 i;
3696
399k
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[u1_pred_lx];
3697
399k
    pred_candt_nodes_t *ps_pred_nodes = ps_pred_ctxt->as_pred_nodes;
3698
399k
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
3699
3700
399k
    S32 inp_shift = 2;
3701
399k
    S32 pred_shift;
3702
399k
    S32 ref_bits;
3703
399k
    S32 mv_p_x, mv_p_y;
3704
399k
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
3705
3706
399k
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[u1_pred_lx][u1_default_ref_id];
3707
3708
    /*************************************************************************/
3709
    /* Priority to bottom left availability. Else we go to left. If both are */
3710
    /* not available, then a remains null                                    */
3711
    /*************************************************************************/
3712
399k
    if(ps_pred_nodes->ps_l->u1_is_avail)
3713
348k
    {
3714
348k
        ps_pred_node_a = ps_pred_nodes->ps_l;
3715
348k
    }
3716
3717
399k
    if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
3718
173k
    {
3719
173k
        ps_pred_node_b = ps_pred_nodes->ps_tr;
3720
173k
    }
3721
226k
    else
3722
226k
    {
3723
226k
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
3724
226k
        ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3725
226k
    }
3726
3727
399k
    if(ps_pred_node_a == NULL)
3728
51.0k
    {
3729
51.0k
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
3730
51.0k
        ps_pred_node_a->s_mv = ps_pred_node_a->ps_mv[0];
3731
3732
51.0k
        if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
3733
14.1k
        {
3734
14.1k
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
3735
14.1k
            ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3736
14.1k
        }
3737
51.0k
    }
3738
3739
399k
    if(ps_pred_node_a->i1_ref_idx != u1_default_ref_id)
3740
123k
    {
3741
123k
        SCALE_FOR_POC_DELTA(
3742
123k
            mv_p_x, mv_p_y, ps_pred_node_a, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3743
123k
    }
3744
276k
    else
3745
276k
    {
3746
276k
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3747
276k
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3748
276k
    }
3749
399k
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3750
399k
    COMPUTE_MV_DIFFERENCE(mvdx1, mvdy1, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3751
399k
    mvdx1 = ABS(mvdx1);
3752
399k
    mvdy1 = ABS(mvdy1);
3753
3754
399k
    if(ps_pred_node_b->i1_ref_idx != u1_default_ref_id)
3755
124k
    {
3756
124k
        SCALE_FOR_POC_DELTA(
3757
124k
            mv_p_x, mv_p_y, ps_pred_node_b, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3758
124k
    }
3759
275k
    else
3760
275k
    {
3761
275k
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
3762
275k
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
3763
275k
    }
3764
399k
    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
3765
399k
    COMPUTE_MV_DIFFERENCE(mvdx2, mvdy2, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3766
399k
    mvdx2 = ABS(mvdx2);
3767
399k
    mvdy2 = ABS(mvdy2);
3768
3769
399k
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
3770
90.8k
    {
3771
1.63M
        for(i = 0; i < TOT_NUM_PARTS; i++)
3772
1.54M
        {
3773
1.54M
            ps_pred_nodes[i].ps_mvp_node = ps_pred_node_a;
3774
1.54M
        }
3775
90.8k
    }
3776
309k
    else
3777
309k
    {
3778
5.56M
        for(i = 0; i < TOT_NUM_PARTS; i++)
3779
5.25M
        {
3780
5.25M
            ps_pred_nodes[i].ps_mvp_node = ps_pred_node_b;
3781
5.25M
        }
3782
309k
    }
3783
399k
}