Coverage Report

Created: 2026-04-12 06:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/encoder/hme_subpel.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/**
22
******************************************************************************
23
* @file hme_subpel.c
24
*
25
* @brief
26
*    Subpel refinement modules for ME algo
27
*
28
* @author
29
*    Ittiam
30
*
31
*
32
* List of Functions
33
* hme_qpel_interp_avg()
34
* hme_subpel_refine_ctblist_bck()
35
* hme_subpel_refine_ctblist_fwd()
36
* hme_refine_bidirect()
37
* hme_subpel_refinement()
38
* hme_subpel_refine_ctb_fwd()
39
* hme_subpel_refine_ctb_bck()
40
* hme_create_bck_inp()
41
* hme_subpel_refine_search_node()
42
******************************************************************************
43
*/
44
45
/*****************************************************************************/
46
/* File Includes                                                             */
47
/*****************************************************************************/
48
/* System include files */
49
#include <stdio.h>
50
#include <string.h>
51
#include <stdlib.h>
52
#include <assert.h>
53
#include <stdarg.h>
54
#include <math.h>
55
#include <limits.h>
56
57
/* User include files */
58
#include "ihevc_typedefs.h"
59
#include "itt_video_api.h"
60
#include "ihevce_api.h"
61
62
#include "rc_cntrl_param.h"
63
#include "rc_frame_info_collector.h"
64
#include "rc_look_ahead_params.h"
65
66
#include "ihevc_defs.h"
67
#include "ihevc_structs.h"
68
#include "ihevc_platform_macros.h"
69
#include "ihevc_deblk.h"
70
#include "ihevc_itrans_recon.h"
71
#include "ihevc_chroma_itrans_recon.h"
72
#include "ihevc_chroma_intra_pred.h"
73
#include "ihevc_intra_pred.h"
74
#include "ihevc_inter_pred.h"
75
#include "ihevc_mem_fns.h"
76
#include "ihevc_padding.h"
77
#include "ihevc_weighted_pred.h"
78
#include "ihevc_sao.h"
79
#include "ihevc_resi_trans.h"
80
#include "ihevc_quant_iquant_ssd.h"
81
#include "ihevc_cabac_tables.h"
82
83
#include "ihevce_defs.h"
84
#include "ihevce_lap_enc_structs.h"
85
#include "ihevce_multi_thrd_structs.h"
86
#include "ihevce_multi_thrd_funcs.h"
87
#include "ihevce_me_common_defs.h"
88
#include "ihevce_had_satd.h"
89
#include "ihevce_error_codes.h"
90
#include "ihevce_bitstream.h"
91
#include "ihevce_cabac.h"
92
#include "ihevce_rdoq_macros.h"
93
#include "ihevce_function_selector.h"
94
#include "ihevce_enc_structs.h"
95
#include "ihevce_entropy_structs.h"
96
#include "ihevce_cmn_utils_instr_set_router.h"
97
#include "ihevce_enc_loop_structs.h"
98
#include "ihevce_bs_compute_ctb.h"
99
#include "ihevce_global_tables.h"
100
#include "ihevce_dep_mngr_interface.h"
101
#include "hme_datatype.h"
102
#include "hme_interface.h"
103
#include "hme_common_defs.h"
104
#include "hme_defs.h"
105
#include "ihevce_me_instr_set_router.h"
106
#include "hme_globals.h"
107
#include "hme_utils.h"
108
#include "hme_coarse.h"
109
#include "hme_fullpel.h"
110
#include "hme_subpel.h"
111
#include "hme_refine.h"
112
#include "hme_err_compute.h"
113
#include "hme_common_utils.h"
114
#include "hme_search_algo.h"
115
#include "ihevce_stasino_helpers.h"
116
#include "ihevce_common_utils.h"
117
118
/*****************************************************************************/
119
/* Function Definitions                                                      */
120
/*****************************************************************************/
121
/**
********************************************************************************
*  @fn     hme_qpel_interp_avg
*
*  @brief  Produces the prediction block for one QPEL motion vector. When the
*          descriptor for the fractional position names the same buffer twice,
*          the result is a pointer straight into that reference plane;
*          otherwise the two source planes are averaged into an output buffer.
*
*  @param[in,out]  ps_prms : reference plane ptrs, strides, blk size and
*                  interp output bufs on input; pu1_final_out and
*                  i4_final_out_stride are written on output
*
*  @param[in]  i4_mv_x : x component of motion vector in QPEL units
*
*  @param[in]  i4_mv_y : y component of motion vector in QPEL units
*
*  @param[in]  i4_buf_id : index into ps_prms->apu1_interp_out selecting the
*              buffer that receives the averaged block (used only when
*              averaging is required)
*
*  @return None
********************************************************************************
*/
void hme_qpel_interp_avg(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
{
    /*************************************************************************/
    /* For a given QPEL pt, we need to determine the 2 source pts that are   */
    /* needed to do the QPEL averaging. i4_mv_x and i4_mv_y are the motion   */
    /* vectors in QPEL units pointing to the pt of interest, w.r.t. the 0,0  */
    /* pt of the reference blk that is colocated to the inp blk.             */
    /*    A j E k B                                                          */
    /*    l m n o p                                                          */
    /*    F q G r H                                                          */
    /*    s t u v w                                                          */
    /*    C x I y D                                                          */
    /* In above diagram, A, B, C, D are full pts at offsets (0,0),(1,0),(0,1)*/
    /* and (1,1) respectively in the fpel buffer (id = 0)                    */
    /* E and I are hxfy pts in offsets (0,0),(0,1) respectively in hxfy buf  */
    /* F and H are fxhy pts in offsets (0,0),(1,0) respectively in fxhy buf  */
    /* G is hxhy pt in offset 0,0 in hxhy buf                                */
    /* All above offsets are computed w.r.t. motion displaced pt in          */
    /* respective bufs. This means that A corresponds to (i4_mv_x >> 2) and  */
    /* (i4_mv_y >> 2) in fxfy buf. Ditto with E, F and G                     */
    /* fxfy buf is buf id 0, hxfy is buf id 1, fxhy is buf id 2, hxhy is 3   */
    /* Example: pt v has a fractional comp of (3, 3) and is the avg of H     */
    /* and I, so the table look up of v should give the following            */
    /* src 1 (H) : offset = (1, 0), buf id = 2.                              */
    /* src 2 (I) : offset = (0, 1), buf id = 1.                              */
    /* NOTE: For pts that are themselves fxfy/hxfy/fxhy/hxhy, both source    */
    /* buf ids in the descriptor are equal and no averaging is done.         */
    /*************************************************************************/
    const S32 i4_stride = ps_prms->i4_ref_stride;

    /* Offset of the integer part of the mv within a reference plane */
    const S32 i4_fpel_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_stride;

    /* Descriptor indexed by [fractional y][fractional x] */
    qpel_input_buf_cfg_t *ps_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y & 3][i4_mv_x & 3];

    /* First source pt: needed by both the direct and the averaging path */
    U08 *pu1_first = ps_prms->ppu1_ref[ps_cfg->i1_buf_id1];

    pu1_first += ps_cfg->i1_buf_xoff1 + i4_fpel_offset;
    pu1_first += (ps_cfg->i1_buf_yoff1 * i4_stride);

    if(ps_cfg->i1_buf_id1 != ps_cfg->i1_buf_id2)
    {
        /* True QPEL pt: average the two source pts into the output buf */
        U08 *pu1_second = ps_prms->ppu1_ref[ps_cfg->i1_buf_id2];
        U08 *pu1_out = ps_prms->apu1_interp_out[i4_buf_id];

        pu1_second += ps_cfg->i1_buf_xoff2 + i4_fpel_offset;
        pu1_second += (ps_cfg->i1_buf_yoff2 * i4_stride);

        hevc_avg_2d(
            pu1_first,
            pu1_second,
            i4_stride,
            i4_stride,
            ps_prms->i4_blk_wd,
            ps_prms->i4_blk_ht,
            pu1_out,
            ps_prms->i4_out_stride);

        ps_prms->pu1_final_out = pu1_out;
        ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
    }
    else
    {
        /* This is case for fxfy/hxfy/fxhy/hxhy: use the plane directly */
        ps_prms->pu1_final_out = pu1_first;
        ps_prms->i4_final_out_stride = i4_stride;
    }
}
193
194
static __inline void hme_qpel_interp_avg_2pt_vert_no_reuse(
195
    interp_prms_t *ps_prms,
196
    S32 i4_mv_x,
197
    S32 i4_mv_y,
198
    U08 **ppu1_final,
199
    S32 *pi4_final_stride,
200
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
201
14.5k
{
202
14.5k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
203
204
14.5k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
205
14.5k
}
206
207
static __inline void hme_qpel_interp_avg_2pt_horz_no_reuse(
208
    interp_prms_t *ps_prms,
209
    S32 i4_mv_x,
210
    S32 i4_mv_y,
211
    U08 **ppu1_final,
212
    S32 *pi4_final_stride,
213
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
214
8.17k
{
215
8.17k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
216
217
8.17k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
218
8.17k
}
219
220
/********************************************************************************
221
*  @fn     hme_qpel_interp_comprehensive
222
*
223
*  @brief  Interpolates 2 qpel points by hpel averaging
224
*
225
*  @param[in,out]  ps_prms: Both input buffer ptrs and location of output
226
*
227
*  @param[in]  i4_mv_x : x component of motion vector in QPEL units
228
*
229
*  @param[in]  i4_mv_y : y component of motion vector in QPEL units
230
*
231
*  @param[in]  i4_grid_mask : mask which determines qpels to be computed
232
*
233
*  @param[out]  ppu1_final : storage for final buffer pointers
234
*
235
*  @param[out]  pi4_final_stride : storage for final buffer strides
236
*
237
*  @return None
238
********************************************************************************
239
*/
240
static __inline void hme_qpel_interp_comprehensive(
241
    interp_prms_t *ps_prms,
242
    U08 **ppu1_final,
243
    S32 *pi4_final_stride,
244
    S32 i4_mv_x,
245
    S32 i4_mv_y,
246
    S32 i4_grid_mask,
247
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
248
373k
{
249
373k
    S32 pt_select_for_TB, pt_select_for_LR;
250
373k
    S32 dx, dy, dydx;
251
373k
    S32 vert_func_selector, horz_func_selector;
252
253
373k
    S32 i4_ref_stride = ps_prms->i4_ref_stride;
254
255
373k
    pt_select_for_TB =
256
373k
        ((i4_grid_mask & (1 << PT_B)) >> PT_B) + ((i4_grid_mask & (1 << PT_T)) >> (PT_T - 1));
257
258
373k
    pt_select_for_LR =
259
373k
        ((i4_grid_mask & (1 << PT_R)) >> PT_R) + ((i4_grid_mask & (1 << PT_L)) >> (PT_L - 1));
260
261
373k
    dx = (i4_mv_x & 3);
262
373k
    dy = (i4_mv_y & 3);
263
373k
    dydx = (dx + (dy << 2));
264
265
373k
    vert_func_selector = gai4_select_qpel_function_vert[pt_select_for_TB][dydx];
266
373k
    horz_func_selector = gai4_select_qpel_function_horz[pt_select_for_LR][dydx];
267
268
    /* case descriptions */
269
    /* Let T = (gridmask & T) & B = (gridmask & B) */
270
    /* & hp = pt is an hpel or an fpel */
271
    /* & r = reuse possible */
272
    /* 0 => T || B = 0 */
273
    /* 1 => (!T) && (B) && hp */
274
    /* 2 => (T) && (!B) && hp */
275
    /* 3 => (!T) && (B) && !hp */
276
    /* 4 => (T) && (!B) && !hp */
277
    /* 5 => (T) && (B) && !hp && r */
278
    /* 6 => (T) && (B) && !hp && !r */
279
    /* 7 => (T) && (B) && hp */
280
281
373k
    switch(vert_func_selector)
282
373k
    {
283
0
    case 0:
284
0
    {
285
0
        break;
286
0
    }
287
25.1k
    case 1:
288
25.1k
    {
289
25.1k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
290
25.1k
        qpel_input_buf_cfg_t *ps_inp_cfg;
291
25.1k
        S32 i4_mvyp1 = (i4_mv_y + 1);
292
293
25.1k
        i4_mv_x_frac = dx;
294
25.1k
        i4_mv_y_frac = i4_mvyp1 & 3;
295
296
25.1k
        i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
297
298
        /* Derive the descriptor that has all offset and size info */
299
25.1k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
300
301
25.1k
        ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
302
25.1k
        ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
303
25.1k
        ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
304
25.1k
        pi4_final_stride[3] = i4_ref_stride;
305
306
25.1k
        break;
307
0
    }
308
24.8k
    case 2:
309
24.8k
    {
310
24.8k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
311
24.8k
        qpel_input_buf_cfg_t *ps_inp_cfg;
312
24.8k
        S32 i4_mvym1 = (i4_mv_y - 1);
313
314
24.8k
        i4_mv_x_frac = dx;
315
24.8k
        i4_mv_y_frac = i4_mvym1 & 3;
316
317
24.8k
        i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
318
319
        /* Derive the descriptor that has all offset and size info */
320
24.8k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
321
322
24.8k
        ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
323
24.8k
        ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
324
24.8k
        ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
325
24.8k
        pi4_final_stride[1] = i4_ref_stride;
326
327
24.8k
        break;
328
0
    }
329
4.29k
    case 3:
330
4.29k
    {
331
4.29k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
332
4.29k
            ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
333
334
4.29k
        break;
335
0
    }
336
4.28k
    case 4:
337
4.28k
    {
338
4.28k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
339
4.28k
            ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
340
341
4.28k
        break;
342
0
    }
343
300k
    case 5:
344
300k
    {
345
300k
        ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_vert_with_reuse(
346
300k
            ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
347
300k
        break;
348
0
    }
349
14.5k
    case 6:
350
14.5k
    {
351
14.5k
        hme_qpel_interp_avg_2pt_vert_no_reuse(
352
14.5k
            ps_prms,
353
14.5k
            i4_mv_x,
354
14.5k
            i4_mv_y,
355
14.5k
            ppu1_final,
356
14.5k
            pi4_final_stride,
357
14.5k
            ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
358
14.5k
        break;
359
0
    }
360
0
    case 7:
361
0
    {
362
0
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
363
0
        qpel_input_buf_cfg_t *ps_inp_cfg;
364
365
0
        S32 i4_mvyp1 = (i4_mv_y + 1);
366
0
        S32 i4_mvym1 = (i4_mv_y - 1);
367
368
0
        i4_mv_x_frac = dx;
369
0
        i4_mv_y_frac = i4_mvyp1 & 3;
370
371
0
        i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
372
373
        /* Derive the descriptor that has all offset and size info */
374
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
375
376
0
        ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
377
0
        ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
378
0
        ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
379
0
        pi4_final_stride[3] = i4_ref_stride;
380
381
0
        i4_mv_y_frac = i4_mvym1 & 3;
382
383
0
        i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
384
385
        /* Derive the descriptor that has all offset and size info */
386
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
387
388
0
        ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
389
0
        ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
390
0
        ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
391
0
        pi4_final_stride[1] = i4_ref_stride;
392
393
0
        break;
394
0
    }
395
373k
    }
396
397
    /* case descriptions */
398
    /* Let L = (gridmask & L) & R = (gridmask & R) */
399
    /* & hp = pt is an hpel or an fpel */
400
    /* & r = reuse possible */
401
    /* 0 => L || R = 0 */
402
    /* 1 => (!L) && (R) && hp */
403
    /* 2 => (L) && (!R) && hp */
404
    /* 3 => (!L) && (R) && !hp */
405
    /* 4 => (L) && (!R) && !hp */
406
    /* 5 => (L) && (R) && !hp && r */
407
    /* 6 => (L) && (R) && !hp && !r */
408
    /* 7 => (L) && (R) && hp */
409
410
373k
    switch(horz_func_selector)
411
373k
    {
412
0
    case 0:
413
0
    {
414
0
        break;
415
0
    }
416
18.3k
    case 1:
417
18.3k
    {
418
18.3k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
419
18.3k
        qpel_input_buf_cfg_t *ps_inp_cfg;
420
18.3k
        S32 i4_mvxp1 = (i4_mv_x + 1);
421
422
18.3k
        i4_mv_x_frac = i4_mvxp1 & 3;
423
18.3k
        i4_mv_y_frac = dy;
424
425
18.3k
        i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
426
427
        /* Derive the descriptor that has all offset and size info */
428
18.3k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
429
430
18.3k
        ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
431
18.3k
        ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
432
18.3k
        ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
433
18.3k
        pi4_final_stride[2] = i4_ref_stride;
434
435
18.3k
        break;
436
0
    }
437
18.9k
    case 2:
438
18.9k
    {
439
18.9k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
440
18.9k
        qpel_input_buf_cfg_t *ps_inp_cfg;
441
18.9k
        S32 i4_mvxm1 = (i4_mv_x - 1);
442
443
18.9k
        i4_mv_x_frac = i4_mvxm1 & 3;
444
18.9k
        i4_mv_y_frac = dy;
445
446
18.9k
        i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
447
448
        /* Derive the descriptor that has all offset and size info */
449
18.9k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
450
451
18.9k
        ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
452
18.9k
        ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
453
18.9k
        ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
454
18.9k
        pi4_final_stride[0] = i4_ref_stride;
455
456
18.9k
        break;
457
0
    }
458
7.86k
    case 3:
459
7.86k
    {
460
7.86k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
461
7.86k
            ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
462
463
7.86k
        break;
464
0
    }
465
7.83k
    case 4:
466
7.83k
    {
467
7.83k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
468
7.83k
            ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
469
470
7.83k
        break;
471
0
    }
472
312k
    case 5:
473
312k
    {
474
312k
        ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_horz_with_reuse(
475
312k
            ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
476
312k
        break;
477
0
    }
478
8.17k
    case 6:
479
8.17k
    {
480
8.17k
        hme_qpel_interp_avg_2pt_horz_no_reuse(
481
8.17k
            ps_prms,
482
8.17k
            i4_mv_x,
483
8.17k
            i4_mv_y,
484
8.17k
            ppu1_final,
485
8.17k
            pi4_final_stride,
486
8.17k
            ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
487
8.17k
        break;
488
0
    }
489
0
    case 7:
490
0
    {
491
0
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
492
0
        qpel_input_buf_cfg_t *ps_inp_cfg;
493
494
0
        S32 i4_mvxp1 = (i4_mv_x + 1);
495
0
        S32 i4_mvxm1 = (i4_mv_x - 1);
496
497
0
        i4_mv_x_frac = i4_mvxp1 & 3;
498
0
        i4_mv_y_frac = dy;
499
500
0
        i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
501
502
        /* Derive the descriptor that has all offset and size info */
503
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
504
505
0
        ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
506
0
        ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
507
0
        ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
508
0
        pi4_final_stride[2] = i4_ref_stride;
509
510
0
        i4_mv_x_frac = i4_mvxm1 & 3;
511
512
0
        i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
513
514
        /* Derive the descriptor that has all offset and size info */
515
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
516
517
0
        ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
518
0
        ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
519
0
        ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
520
0
        pi4_final_stride[0] = i4_ref_stride;
521
522
0
        break;
523
0
    }
524
373k
    }
525
373k
}
526
527
/**
528
********************************************************************************
529
*  @fn     S32 hme_compute_pred_and_evaluate_bi(hme_subpel_prms_t *ps_prms,
530
*                                   search_results_t *ps_search_results,
531
*                                   layer_ctxt_t *ps_curr_layer,
532
*                                   U08 **ppu1_pred)
533
*
534
*
535
*  @brief  Evaluates the best bipred cost as avg(P0, P1) where P0 and P1 are
536
*          best L0 and L1 bufs respectively for the entire CU
537
*
538
*  @param[in]  ps_prms: subpel prms input to this function
539
*
540
*  @param[in] ps_curr_layer: points to the current layer ctxt
541
*
542
*  @return The best BI cost or best uni cost, whichever is better
543
********************************************************************************
544
*/
545
void hme_compute_pred_and_evaluate_bi(
546
    inter_cu_results_t *ps_cu_results,
547
    inter_pu_results_t *ps_pu_results,
548
    inter_ctb_prms_t *ps_inter_ctb_prms,
549
    part_type_results_t *ps_part_type_result,
550
    ULWORD64 *pu8_winning_pred_sigmaXSquare,
551
    ULWORD64 *pu8_winning_pred_sigmaX,
552
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
553
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
554
397k
{
555
    /* Idx0 - Uni winner */
556
    /* Idx1 - Uni runner-up */
557
    /* Idx2 - Bi winner */
558
397k
    hme_pred_buf_info_t as_pred_buf_data[3][NUM_INTER_PU_PARTS];
559
397k
    err_prms_t s_err_prms;
560
397k
    interp_prms_t s_interp_prms;
561
562
397k
    PF_SAD_FXN_T pf_err_compute;
563
564
397k
    S32 i, j;
565
397k
    S32 x_off, y_off, x_pic, y_pic;
566
397k
    S32 i4_sad_grid;
567
397k
    U08 e_cu_size;
568
397k
    S32 i4_part_type;
569
397k
    U08 u1_cu_size;
570
397k
    S32 shift;
571
397k
    S32 x_part, y_part, num_parts;
572
397k
    S32 inp_stride, ref_stride;
573
397k
    U08 au1_pred_buf_array_indixes[3];
574
397k
    S32 cur_iter_best_cost;
575
397k
    S32 uni_cost, bi_cost, best_cost, tot_cost;
576
    /* Idx0 - Uni winner */
577
    /* Idx1 - Bi winner */
578
397k
    ULWORD64 au8_sigmaX[2][NUM_INTER_PU_PARTS];
579
397k
    ULWORD64 au8_sigmaXSquared[2][NUM_INTER_PU_PARTS];
580
397k
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
581
397k
    S32 i4_noise_term;
582
397k
#endif
583
584
397k
    interp_prms_t *ps_interp_prms = &s_interp_prms;
585
586
397k
    S32 best_cand_in_opp_dir_idx = 0;
587
397k
    S32 is_best_cand_an_intra = 0;
588
397k
    U08 u1_is_cu_noisy = ps_inter_ctb_prms->u1_is_cu_noisy;
589
397k
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
590
397k
    const S32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
591
397k
#endif
592
397k
    tot_cost = 0;
593
594
    /* Start of the CU w.r.t. CTB */
595
397k
    x_off = ps_cu_results->u1_x_off;
596
397k
    y_off = ps_cu_results->u1_y_off;
597
598
397k
    inp_stride = ps_inter_ctb_prms->i4_inp_stride;
599
397k
    ref_stride = ps_inter_ctb_prms->i4_rec_stride;
600
601
397k
    ps_interp_prms->i4_ref_stride = ref_stride;
602
603
    /* Start of the CU w.r.t. Pic 0,0 */
604
397k
    x_pic = x_off + ps_inter_ctb_prms->i4_ctb_x_off;
605
397k
    y_pic = y_off + ps_inter_ctb_prms->i4_ctb_y_off;
606
607
397k
    u1_cu_size = ps_cu_results->u1_cu_size;
608
397k
    e_cu_size = u1_cu_size;
609
397k
    shift = (S32)e_cu_size;
610
397k
    i4_part_type = ps_part_type_result->u1_part_type;
611
397k
    num_parts = gau1_num_parts_in_part_type[i4_part_type];
612
613
1.58M
    for(i = 0; i < 3; i++)
614
1.19M
    {
615
1.19M
        hme_init_pred_buf_info(
616
1.19M
            &as_pred_buf_data[i],
617
1.19M
            &ps_inter_ctb_prms->s_pred_buf_mngr,
618
1.19M
            (ps_part_type_result->as_pu_results->pu.b4_wd + 1) << 2,
619
1.19M
            (ps_part_type_result->as_pu_results->pu.b4_ht + 1) << 2,
620
1.19M
            (PART_TYPE_T)i4_part_type);
621
622
1.19M
        au1_pred_buf_array_indixes[i] = as_pred_buf_data[i][0].u1_pred_buf_array_id;
623
1.19M
    }
624
625
895k
    for(j = 0; j < num_parts; j++)
626
498k
    {
627
498k
        UWORD8 *apu1_hpel_ref[2][4];
628
498k
        PART_ID_T e_part_id;
629
498k
        BLK_SIZE_T e_blk_size;
630
498k
        WORD8 i1_ref_idx;
631
498k
        UWORD8 pred_dir;
632
498k
        WORD32 ref_offset, inp_offset, wd, ht;
633
498k
        pu_result_t *ps_pu_node1, *ps_pu_node2, *ps_pu_result;
634
498k
        mv_t *aps_mv[2];
635
498k
        UWORD8 num_active_ref_opp;
636
498k
        UWORD8 num_results_per_part;
637
498k
        WORD32 luma_weight_ref1, luma_offset_ref1;
638
498k
        WORD32 luma_weight_ref2, luma_offset_ref2;
639
498k
        WORD32 pu_node2_found = 0;
640
641
498k
        e_part_id = ge_part_type_to_part_id[i4_part_type][j];
642
498k
        e_blk_size = ge_part_id_to_blk_size[e_cu_size][e_part_id];
643
644
498k
        x_part = gas_part_attr_in_cu[e_part_id].u1_x_start << shift;
645
498k
        y_part = gas_part_attr_in_cu[e_part_id].u1_y_start << shift;
646
647
498k
        ref_offset = (x_part + x_pic) + (y_pic + y_part) * ref_stride;
648
498k
        inp_offset = (x_part + y_part * inp_stride) + ps_cu_results->i4_inp_offset;
649
650
498k
        pred_dir = ps_part_type_result->as_pu_results[j].pu.b2_pred_mode;
651
652
498k
        ps_pu_node1 = &(ps_part_type_result->as_pu_results[j]);
653
654
498k
        if(PRED_L0 == pred_dir)
655
489k
        {
656
489k
            i1_ref_idx = ps_pu_node1->pu.mv.i1_l0_ref_idx;
657
489k
            aps_mv[0] = &(ps_pu_node1->pu.mv.s_l0_mv);
658
659
489k
            num_active_ref_opp =
660
489k
                ps_inter_ctb_prms->u1_num_active_ref_l1 * (ps_inter_ctb_prms->i4_bidir_enabled);
661
489k
            num_results_per_part = ps_pu_results->u1_num_results_per_part_l0[e_part_id];
662
663
489k
            ps_pu_result = ps_pu_results->aps_pu_results[PRED_L0][e_part_id];
664
665
489k
            ASSERT(i1_ref_idx >= 0);
666
667
489k
            apu1_hpel_ref[0][0] =
668
489k
                (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
669
489k
                ref_offset;
670
489k
            apu1_hpel_ref[0][1] =
671
489k
                ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
672
489k
                ref_offset;
673
489k
            apu1_hpel_ref[0][2] =
674
489k
                ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
675
489k
                ref_offset;
676
489k
            apu1_hpel_ref[0][3] =
677
489k
                ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
678
489k
                ref_offset;
679
680
489k
            luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
681
489k
                                   ->s_weight_offset.i2_luma_weight;
682
489k
            luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
683
489k
                                   ->s_weight_offset.i2_luma_offset;
684
489k
        }
685
9.00k
        else
686
9.00k
        {
687
9.00k
            i1_ref_idx = ps_pu_node1->pu.mv.i1_l1_ref_idx;
688
9.00k
            aps_mv[0] = &(ps_pu_node1->pu.mv.s_l1_mv);
689
690
9.00k
            ASSERT(i1_ref_idx >= 0);
691
692
9.00k
            num_active_ref_opp =
693
9.00k
                ps_inter_ctb_prms->u1_num_active_ref_l0 * (ps_inter_ctb_prms->i4_bidir_enabled);
694
9.00k
            num_results_per_part = ps_pu_results->u1_num_results_per_part_l1[e_part_id];
695
696
9.00k
            ps_pu_result = ps_pu_results->aps_pu_results[PRED_L1][e_part_id];
697
698
9.00k
            apu1_hpel_ref[0][0] =
699
9.00k
                (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
700
9.00k
                ref_offset;
701
9.00k
            apu1_hpel_ref[0][1] =
702
9.00k
                ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
703
9.00k
                ref_offset;
704
9.00k
            apu1_hpel_ref[0][2] =
705
9.00k
                ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
706
9.00k
                ref_offset;
707
9.00k
            apu1_hpel_ref[0][3] =
708
9.00k
                ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
709
9.00k
                ref_offset;
710
711
9.00k
            luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
712
9.00k
                                   ->s_weight_offset.i2_luma_weight;
713
9.00k
            luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
714
9.00k
                                   ->s_weight_offset.i2_luma_offset;
715
9.00k
        }
716
717
498k
        if(aps_mv[0]->i2_mvx == INTRA_MV)
718
0
        {
719
0
            uni_cost = ps_pu_node1->i4_tot_cost;
720
0
            cur_iter_best_cost = ps_pu_node1->i4_tot_cost;
721
0
            best_cost = MIN(uni_cost, cur_iter_best_cost);
722
0
            tot_cost += best_cost;
723
0
            continue;
724
0
        }
725
726
498k
        ps_interp_prms->i4_blk_wd = wd = gau1_blk_size_to_wd[e_blk_size];
727
498k
        ps_interp_prms->i4_blk_ht = ht = gau1_blk_size_to_ht[e_blk_size];
728
498k
        ps_interp_prms->i4_out_stride = MAX_CU_SIZE;
729
730
498k
        if(num_active_ref_opp)
731
23.9k
        {
732
23.9k
            if(PRED_L0 == pred_dir)
733
14.9k
            {
734
14.9k
                if(ps_pu_results->u1_num_results_per_part_l1[e_part_id])
735
14.5k
                {
736
14.5k
                    ps_pu_node2 = ps_pu_results->aps_pu_results[1][e_part_id];
737
14.5k
                    pu_node2_found = 1;
738
14.5k
                }
739
14.9k
            }
740
9.00k
            else
741
9.00k
            {
742
9.00k
                if(ps_pu_results->u1_num_results_per_part_l0[e_part_id])
743
9.00k
                {
744
9.00k
                    ps_pu_node2 = ps_pu_results->aps_pu_results[0][e_part_id];
745
9.00k
                    pu_node2_found = 1;
746
9.00k
                }
747
9.00k
            }
748
23.9k
        }
749
750
498k
        if(!pu_node2_found)
751
474k
        {
752
474k
            bi_cost = INT_MAX >> 1;
753
754
474k
            s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
755
474k
            ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
756
757
474k
            ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
758
474k
                ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
759
760
474k
            if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
761
387k
            {
762
387k
                as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
763
387k
                as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
764
387k
                as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
765
387k
            }
766
767
474k
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
768
0
            {
769
0
                hme_compute_sigmaX_and_sigmaXSquared(
770
0
                    as_pred_buf_data[0][j].pu1_pred,
771
0
                    as_pred_buf_data[0][j].i4_pred_stride,
772
0
                    &au8_sigmaX[0][j],
773
0
                    &au8_sigmaXSquared[0][j],
774
0
                    ps_interp_prms->i4_blk_wd,
775
0
                    ps_interp_prms->i4_blk_ht,
776
0
                    ps_interp_prms->i4_blk_wd,
777
0
                    ps_interp_prms->i4_blk_ht,
778
0
                    0,
779
0
                    1);
780
0
            }
781
474k
        }
782
23.5k
        else
783
23.5k
        {
784
23.5k
            i = 0;
785
23.5k
            bi_cost = MAX_32BIT_VAL;
786
23.5k
            is_best_cand_an_intra = 0;
787
23.5k
            best_cand_in_opp_dir_idx = 0;
788
789
23.5k
            pred_dir = ps_pu_node2[i].pu.b2_pred_mode;
790
791
23.5k
            if(PRED_L0 == pred_dir)
792
9.00k
            {
793
9.00k
                i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l0_ref_idx;
794
9.00k
                aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l0_mv);
795
796
9.00k
                ASSERT(i1_ref_idx >= 0);
797
798
9.00k
                apu1_hpel_ref[1][0] =
799
9.00k
                    (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
800
9.00k
                                   ->s_yuv_buf_desc.pv_y_buf) +
801
9.00k
                    ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
802
9.00k
                apu1_hpel_ref[1][1] =
803
9.00k
                    ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
804
9.00k
                    ref_offset;
805
9.00k
                apu1_hpel_ref[1][2] =
806
9.00k
                    ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
807
9.00k
                    ref_offset;
808
9.00k
                apu1_hpel_ref[1][3] =
809
9.00k
                    ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
810
9.00k
                    ref_offset;
811
812
9.00k
                luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
813
9.00k
                                       ->s_weight_offset.i2_luma_weight;
814
9.00k
                luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
815
9.00k
                                       ->s_weight_offset.i2_luma_offset;
816
9.00k
            }
817
14.5k
            else
818
14.5k
            {
819
14.5k
                i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l1_ref_idx;
820
14.5k
                aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l1_mv);
821
822
14.5k
                ASSERT(i1_ref_idx >= 0);
823
824
14.5k
                apu1_hpel_ref[1][0] =
825
14.5k
                    (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
826
14.5k
                                   ->s_yuv_buf_desc.pv_y_buf) +
827
14.5k
                    ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
828
14.5k
                apu1_hpel_ref[1][1] =
829
14.5k
                    ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
830
14.5k
                    ref_offset;
831
14.5k
                apu1_hpel_ref[1][2] =
832
14.5k
                    ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
833
14.5k
                    ref_offset;
834
14.5k
                apu1_hpel_ref[1][3] =
835
14.5k
                    ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
836
14.5k
                    ref_offset;
837
838
14.5k
                luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
839
14.5k
                                       ->s_weight_offset.i2_luma_weight;
840
14.5k
                luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
841
14.5k
                                       ->s_weight_offset.i2_luma_offset;
842
14.5k
            }
843
844
23.5k
            if(aps_mv[1]->i2_mvx == INTRA_MV)
845
0
            {
846
0
                uni_cost = ps_pu_node1->i4_tot_cost;
847
0
                cur_iter_best_cost = ps_pu_node2[i].i4_tot_cost;
848
849
0
                if(cur_iter_best_cost < bi_cost)
850
0
                {
851
0
                    bi_cost = cur_iter_best_cost;
852
0
                    best_cand_in_opp_dir_idx = i;
853
0
                    is_best_cand_an_intra = 1;
854
0
                }
855
856
0
                best_cost = MIN(uni_cost, bi_cost);
857
0
                tot_cost += best_cost;
858
0
                continue;
859
0
            }
860
861
23.5k
            s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
862
23.5k
            ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
863
864
23.5k
            ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
865
23.5k
                ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
866
867
23.5k
            if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
868
20.2k
            {
869
20.2k
                as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
870
20.2k
                as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
871
20.2k
                as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
872
20.2k
            }
873
874
23.5k
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
875
0
            {
876
0
                hme_compute_sigmaX_and_sigmaXSquared(
877
0
                    as_pred_buf_data[0][j].pu1_pred,
878
0
                    as_pred_buf_data[0][j].i4_pred_stride,
879
0
                    &au8_sigmaX[0][j],
880
0
                    &au8_sigmaXSquared[0][j],
881
0
                    ps_interp_prms->i4_blk_wd,
882
0
                    ps_interp_prms->i4_blk_ht,
883
0
                    ps_interp_prms->i4_blk_wd,
884
0
                    ps_interp_prms->i4_blk_ht,
885
0
                    0,
886
0
                    1);
887
0
            }
888
889
23.5k
            s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[1][j].pu1_pred;
890
23.5k
            ps_interp_prms->ppu1_ref = &apu1_hpel_ref[1][0];
891
892
23.5k
            ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
893
23.5k
                ps_interp_prms, aps_mv[1]->i2_mvx, aps_mv[1]->i2_mvy, 0);
894
895
23.5k
            if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
896
19.6k
            {
897
19.6k
                as_pred_buf_data[1][j].u1_pred_buf_array_id = UCHAR_MAX;
898
19.6k
                as_pred_buf_data[1][j].pu1_pred = ps_interp_prms->pu1_final_out;
899
19.6k
                as_pred_buf_data[1][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
900
19.6k
            }
901
902
23.5k
            ps_cmn_utils_optimised_function_list->pf_wt_avg_2d(
903
23.5k
                as_pred_buf_data[0][j].pu1_pred,
904
23.5k
                as_pred_buf_data[1][j].pu1_pred,
905
23.5k
                as_pred_buf_data[0][j].i4_pred_stride,
906
23.5k
                as_pred_buf_data[1][j].i4_pred_stride,
907
23.5k
                wd,
908
23.5k
                ht,
909
23.5k
                as_pred_buf_data[2][j].pu1_pred,
910
23.5k
                as_pred_buf_data[2][j].i4_pred_stride,
911
23.5k
                luma_weight_ref1,
912
23.5k
                luma_weight_ref2,
913
23.5k
                luma_offset_ref1,
914
23.5k
                luma_offset_ref2,
915
23.5k
                ps_inter_ctb_prms->wpred_log_wdc);
916
917
23.5k
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
918
0
            {
919
0
                hme_compute_sigmaX_and_sigmaXSquared(
920
0
                    as_pred_buf_data[2][j].pu1_pred,
921
0
                    as_pred_buf_data[2][j].i4_pred_stride,
922
0
                    &au8_sigmaX[1][j],
923
0
                    &au8_sigmaXSquared[1][j],
924
0
                    ps_interp_prms->i4_blk_wd,
925
0
                    ps_interp_prms->i4_blk_ht,
926
0
                    ps_interp_prms->i4_blk_wd,
927
0
                    ps_interp_prms->i4_blk_ht,
928
0
                    0,
929
0
                    1);
930
0
            }
931
932
23.5k
            s_err_prms.pu1_inp = (U08 *)ps_inter_ctb_prms->pu1_non_wt_inp + inp_offset;
933
23.5k
            s_err_prms.i4_inp_stride = inp_stride;
934
23.5k
            s_err_prms.i4_ref_stride = as_pred_buf_data[2][j].i4_pred_stride;
935
23.5k
            s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
936
23.5k
            s_err_prms.i4_grid_mask = 1;
937
23.5k
            s_err_prms.pi4_sad_grid = &i4_sad_grid;
938
23.5k
            s_err_prms.i4_blk_wd = wd;
939
23.5k
            s_err_prms.i4_blk_ht = ht;
940
23.5k
            s_err_prms.pu1_ref = as_pred_buf_data[2][j].pu1_pred;
941
23.5k
            s_err_prms.ps_cmn_utils_optimised_function_list = ps_cmn_utils_optimised_function_list;
942
943
23.5k
            if(ps_inter_ctb_prms->u1_use_satd)
944
19.3k
            {
945
19.3k
                pf_err_compute = compute_satd_8bit;
946
19.3k
            }
947
4.21k
            else
948
4.21k
            {
949
4.21k
                pf_err_compute = ps_me_optimised_function_list->pf_evalsad_pt_npu_mxn_8bit;
950
4.21k
            }
951
952
23.5k
            pf_err_compute(&s_err_prms);
953
954
23.5k
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
955
23.5k
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
956
0
            {
957
0
                unsigned long u4_shift_val;
958
0
                ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
959
0
                ULWORD64 u8_temp_var, u8_temp_var1;
960
0
                S32 i4_bits_req;
961
962
0
                S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
963
964
0
                u8_pred_sigmaSquareX = (au8_sigmaX[1][j] * au8_sigmaX[1][j]);
965
0
                u8_pred_variance = au8_sigmaXSquared[1][j] - u8_pred_sigmaSquareX;
966
967
0
                if(e_cu_size == CU_8x8)
968
0
                {
969
0
                    PART_ID_T e_part_id =
970
0
                        (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
971
972
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
973
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaX,
974
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
975
0
                        &u8_src_variance,
976
0
                        i4_default_src_wt,
977
0
                        0,
978
0
                        ps_inter_ctb_prms->wpred_log_wdc,
979
0
                        e_part_id);
980
0
                }
981
0
                else
982
0
                {
983
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
984
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaX,
985
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
986
0
                        &u8_src_variance,
987
0
                        i4_default_src_wt,
988
0
                        0,
989
0
                        ps_inter_ctb_prms->wpred_log_wdc,
990
0
                        e_part_id);
991
0
                }
992
993
0
                u8_pred_variance = u8_pred_variance >> u4_shift_val;
994
995
0
                GETRANGE64(i4_bits_req, u8_pred_variance);
996
997
0
                if(i4_bits_req > 27)
998
0
                {
999
0
                    u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1000
0
                    u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1001
0
                }
1002
1003
0
                if(u8_src_variance == u8_pred_variance)
1004
0
                {
1005
0
                    u8_temp_var = (1 << STIM_Q_FORMAT);
1006
0
                }
1007
0
                else
1008
0
                {
1009
0
                    u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1010
0
                    u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1011
0
                    u8_temp_var1 =
1012
0
                        (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1013
0
                    u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1014
0
                    u8_temp_var = (u8_temp_var / u8_temp_var1);
1015
0
                }
1016
1017
0
                i4_noise_term = (UWORD32)u8_temp_var;
1018
1019
0
                i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1020
1021
0
                ASSERT(i4_noise_term >= 0);
1022
1023
0
                u8_temp_var = i4_sad_grid;
1024
0
                u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1025
0
                u8_temp_var += (1 << ((i4_q_level)-1));
1026
0
                i4_sad_grid = (UWORD32)(u8_temp_var >> (i4_q_level));
1027
0
            }
1028
23.5k
#endif
1029
1030
23.5k
            cur_iter_best_cost = i4_sad_grid;
1031
23.5k
            cur_iter_best_cost += ps_pu_node1->i4_mv_cost;
1032
23.5k
            cur_iter_best_cost += ps_pu_node2[i].i4_mv_cost;
1033
1034
23.5k
            if(cur_iter_best_cost < bi_cost)
1035
23.5k
            {
1036
23.5k
                bi_cost = cur_iter_best_cost;
1037
23.5k
                best_cand_in_opp_dir_idx = i;
1038
23.5k
                is_best_cand_an_intra = 0;
1039
23.5k
            }
1040
23.5k
        }
1041
1042
498k
        uni_cost = ps_pu_node1->i4_tot_cost;
1043
1044
498k
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
1045
498k
        if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1046
0
        {
1047
0
            unsigned long u4_shift_val;
1048
0
            ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
1049
0
            ULWORD64 u8_temp_var, u8_temp_var1;
1050
0
            S32 i4_bits_req;
1051
1052
0
            S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
1053
1054
0
            S08 i1_ref_idx =
1055
0
                (PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1056
0
                    ? ps_inter_ctb_prms->pi1_past_list[ps_pu_node1->pu.mv.i1_l0_ref_idx]
1057
0
                    : ps_inter_ctb_prms->pi1_future_list[ps_pu_node1->pu.mv.i1_l1_ref_idx];
1058
0
            S32 i4_sad = ps_pu_node1->i4_tot_cost - ps_pu_node1->i4_mv_cost;
1059
1060
0
            u8_pred_sigmaSquareX = (au8_sigmaX[0][j] * au8_sigmaX[0][j]);
1061
0
            u8_pred_variance = au8_sigmaXSquared[0][j] - u8_pred_sigmaSquareX;
1062
1063
0
            if(e_cu_size == CU_8x8)
1064
0
            {
1065
0
                PART_ID_T e_part_id =
1066
0
                    (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
1067
1068
0
                u4_shift_val = ihevce_calc_stim_injected_variance(
1069
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaX,
1070
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1071
0
                    &u8_src_variance,
1072
0
                    ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1073
0
                    ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1074
0
                    ps_inter_ctb_prms->wpred_log_wdc,
1075
0
                    e_part_id);
1076
0
            }
1077
0
            else
1078
0
            {
1079
0
                u4_shift_val = ihevce_calc_stim_injected_variance(
1080
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaX,
1081
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1082
0
                    &u8_src_variance,
1083
0
                    ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1084
0
                    ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1085
0
                    ps_inter_ctb_prms->wpred_log_wdc,
1086
0
                    e_part_id);
1087
0
            }
1088
1089
0
            u8_pred_variance = u8_pred_variance >> (u4_shift_val);
1090
1091
0
            GETRANGE64(i4_bits_req, u8_pred_variance);
1092
1093
0
            if(i4_bits_req > 27)
1094
0
            {
1095
0
                u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1096
0
                u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1097
0
            }
1098
1099
0
            if(u8_src_variance == u8_pred_variance)
1100
0
            {
1101
0
                u8_temp_var = (1 << STIM_Q_FORMAT);
1102
0
            }
1103
0
            else
1104
0
            {
1105
0
                u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1106
0
                u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1107
0
                u8_temp_var1 =
1108
0
                    (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1109
0
                u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1110
0
                u8_temp_var = (u8_temp_var / u8_temp_var1);
1111
0
            }
1112
1113
0
            i4_noise_term = (UWORD32)u8_temp_var;
1114
1115
0
            i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1116
1117
0
            ASSERT(i4_noise_term >= 0);
1118
1119
0
            u8_temp_var = i4_sad;
1120
0
            u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1121
0
            u8_temp_var += (1 << ((i4_q_level)-1));
1122
0
            i4_sad = (UWORD32)(u8_temp_var >> (i4_q_level));
1123
1124
0
            uni_cost = i4_sad + ps_pu_node1->i4_mv_cost;
1125
1126
0
            pu8_winning_pred_sigmaX[j] = au8_sigmaX[0][j];
1127
0
            pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[0][j];
1128
0
        }
1129
498k
#endif
1130
1131
498k
        if((bi_cost < uni_cost) && (!is_best_cand_an_intra))
1132
4.07k
        {
1133
4.07k
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1134
0
            {
1135
0
                pu8_winning_pred_sigmaX[j] = au8_sigmaX[1][j];
1136
0
                pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[1][j];
1137
0
            }
1138
1139
4.07k
            if(PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1140
1.95k
            {
1141
1.95k
                ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1142
1143
1.95k
                if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1144
0
                {
1145
0
                    ps_pu_node1->pu.mv.i1_l1_ref_idx =
1146
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1147
0
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1148
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1149
0
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1150
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1151
0
                }
1152
1.95k
                else
1153
1.95k
                {
1154
1.95k
                    ps_pu_node1->pu.mv.i1_l1_ref_idx =
1155
1.95k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1156
1.95k
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1157
1.95k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1158
1.95k
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1159
1.95k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1160
1.95k
                }
1161
1.95k
            }
1162
2.11k
            else
1163
2.11k
            {
1164
2.11k
                ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1165
1166
2.11k
                if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1167
2.11k
                {
1168
2.11k
                    ps_pu_node1->pu.mv.i1_l0_ref_idx =
1169
2.11k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1170
2.11k
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1171
2.11k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1172
2.11k
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1173
2.11k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1174
2.11k
                }
1175
0
                else
1176
0
                {
1177
0
                    ps_pu_node1->pu.mv.i1_l0_ref_idx =
1178
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1179
0
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1180
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1181
0
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1182
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1183
0
                }
1184
2.11k
            }
1185
1186
4.07k
            ps_part_type_result->as_pu_results[j].i4_tot_cost = bi_cost;
1187
4.07k
        }
1188
1189
498k
        best_cost = MIN(uni_cost, bi_cost);
1190
498k
        tot_cost += best_cost;
1191
498k
    }
1192
1193
397k
    hme_debrief_bipred_eval(
1194
397k
        ps_part_type_result,
1195
397k
        as_pred_buf_data,
1196
397k
        &ps_inter_ctb_prms->s_pred_buf_mngr,
1197
397k
        au1_pred_buf_array_indixes,
1198
397k
        ps_cmn_utils_optimised_function_list);
1199
1200
397k
    ps_part_type_result->i4_tot_cost = tot_cost;
1201
397k
}
1202
1203
WORD32 hme_evalsatd_pt_pu_8x8_tu_rec(
1204
    err_prms_t *ps_prms,
1205
    WORD32 lambda,
1206
    WORD32 lambda_q_shift,
1207
    WORD32 i4_frm_qstep,
1208
    me_func_selector_t *ps_func_selector)
1209
187k
{
1210
187k
    S32 ai4_satd_4x4[4]; /* num 4x4s in a 8x8 */
1211
187k
    S32 i4_satd_8x8;
1212
187k
    S16 *pi2_had_out;
1213
187k
    S32 i4_tu_split_flag = 0;
1214
187k
    S32 i4_tu_early_cbf = 0;
1215
1216
187k
    S32 i4_early_cbf = 1;
1217
    //  S32 i4_i, i4_k;
1218
187k
    S32 i4_total_satd_cost = 0;
1219
187k
    S32 best_cost_tu_split;
1220
1221
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1222
187k
    S32 *api4_satd_pu[HAD_32x32 + 1];
1223
187k
    S32 *api4_tu_split[HAD_32x32 + 1];
1224
187k
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1225
1226
187k
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1227
187k
    S32 *pi4_tu_split = ps_prms->pi4_tu_split_flags;
1228
187k
    S32 *pi4_early_cbf = ps_prms->pi4_tu_early_cbf;
1229
1230
187k
    U08 *pu1_inp = ps_prms->pu1_inp;
1231
187k
    U08 *pu1_ref = ps_prms->pu1_ref;
1232
1233
187k
    S32 inp_stride = ps_prms->i4_inp_stride;
1234
187k
    S32 ref_stride = ps_prms->i4_ref_stride;
1235
1236
    /* Initialize tu_split_cost to "0" */
1237
187k
    ps_prms->i4_tu_split_cost = 0;
1238
187k
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1239
1240
187k
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1241
187k
    api4_satd_pu[HAD_8x8] = &i4_satd_8x8;
1242
187k
    api4_satd_pu[HAD_16x16] = NULL;
1243
187k
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1244
1245
187k
    api4_tu_split[HAD_4x4] = NULL;
1246
187k
    api4_tu_split[HAD_8x8] = &i4_tu_split_flag;
1247
187k
    api4_tu_split[HAD_16x16] = NULL;
1248
187k
    api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1249
1250
187k
    api4_tu_early_cbf[HAD_4x4] = NULL;
1251
187k
    api4_tu_early_cbf[HAD_8x8] = &i4_tu_early_cbf;
1252
187k
    api4_tu_early_cbf[HAD_16x16] = NULL;
1253
187k
    api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1254
1255
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1256
1257
    /* Return value is merge of both best_stad_cost and tu_split_flags */
1258
187k
    best_cost_tu_split = ps_func_selector->pf_had_8x8_using_4_4x4_r(
1259
187k
        pu1_inp,
1260
187k
        inp_stride,
1261
187k
        pu1_ref,
1262
187k
        ref_stride,
1263
187k
        pi2_had_out,
1264
187k
        8,
1265
187k
        api4_satd_pu,
1266
187k
        api4_tu_split,
1267
187k
        api4_tu_early_cbf,
1268
187k
        0,
1269
187k
        2,
1270
187k
        0,
1271
187k
        0,
1272
187k
        i4_frm_qstep,
1273
187k
        0,
1274
187k
        ps_prms->u1_max_tr_depth,
1275
187k
        ps_prms->u1_max_tr_size,
1276
187k
        &(ps_prms->i4_tu_split_cost),
1277
187k
        NULL);
1278
1279
    /* For SATD computation following TU size are assumed for a 8x8 CU */
1280
    /* 8 for 2Nx2N, 4 for Nx2N,2NxN                                    */
1281
1282
187k
    i4_total_satd_cost = best_cost_tu_split >> 2;
1283
1284
    /* Second last bit has the tu pslit flag */
1285
187k
    i4_tu_split_flag = (best_cost_tu_split & 0x3) >> 1;
1286
1287
    /* Last bit corrsponds to the Early CBF flag */
1288
187k
    i4_early_cbf = (best_cost_tu_split & 0x1);
1289
1290
    /* Update 8x8 SATDs */
1291
187k
    pi4_sad_grid[PART_ID_2Nx2N] = i4_satd_8x8;
1292
187k
    pi4_tu_split[PART_ID_2Nx2N] = i4_tu_split_flag;
1293
187k
    pi4_early_cbf[PART_ID_2Nx2N] = i4_early_cbf;
1294
1295
187k
    return i4_total_satd_cost;
1296
187k
}
1297
//#endif
1298
/**
1299
********************************************************************************
1300
*  @fn     void hme_evalsatd_update_2_best_results_pt_pu_16x16
1301
*
1302
*  @brief  Evaluates the SATD with partial updates for all the best partitions
1303
*          of a 16x16 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
1304
*
1305
*  @param[inout]  ps_prms: error prms containing current and ref ptr, strides,
1306
*                 pointer to sad grid of each partitions
1307
*
1308
*  @return     None
1309
********************************************************************************
1310
*/
1311
1312
void hme_evalsatd_update_2_best_results_pt_pu_16x16(
1313
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
1314
0
{
1315
0
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1316
0
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1317
0
    S32 i4_satd_16x16; /* 16x16 satd cost     */
1318
0
    S32 i;
1319
0
    S16 ai2_8x8_had[256];
1320
0
    S16 *pi2_y0;
1321
0
    U08 *pu1_src, *pu1_pred;
1322
0
    S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
1323
0
    S32 *ppi4_hsad;
1324
1325
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1326
0
    S32 *api4_satd_pu[HAD_32x32 + 1];
1327
0
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1328
1329
0
    U08 *pu1_inp = ps_prms->pu1_inp;
1330
0
    U08 *pu1_ref = ps_prms->pu1_ref;
1331
1332
0
    S32 inp_stride = ps_prms->i4_inp_stride;
1333
0
    S32 ref_stride = ps_prms->i4_ref_stride;
1334
1335
0
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1336
0
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1337
0
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1338
0
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1339
1340
0
    ppi4_hsad = api4_satd_pu[HAD_16x16];
1341
1342
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1343
0
    for(i = 0; i < 4; i++)
1344
0
    {
1345
0
        pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8;
1346
0
        pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8;
1347
0
        pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1348
0
        pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1349
1350
0
        ihevce_had_8x8_using_4_4x4(
1351
0
            pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
1352
0
    }
1353
1354
    /* For SATD computation following TU size are assumed for a 16x16 CU */
1355
    /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs     */
1356
1357
    /* Update 8x8 SATDs */
1358
    /* Modified to cost calculation using only 4x4 SATD */
1359
1360
    //  ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1361
    //  ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7];
1362
    //  ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13];
1363
    //  ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1364
1365
    /* Update 16x16 SATDs */
1366
0
    pi4_sad_grid[PART_ID_2Nx2N] =
1367
0
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1368
1369
0
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
1370
0
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
1371
0
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
1372
0
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
1373
1374
    /* Update 8x16 / 16x8 SATDs */
1375
0
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
1376
0
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
1377
0
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
1378
0
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
1379
1380
    /* Update AMP SATDs 16x12,16x4, 12x16,4x16  */
1381
0
    pi4_sad_grid[PART_ID_nLx2N_L] =
1382
0
        ai4_satd_4x4[0] + ai4_satd_4x4[4] + ai4_satd_4x4[8] + ai4_satd_4x4[12];
1383
1384
0
    pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_4x4[1] + ai4_satd_4x4[5] + ai4_satd_4x4[9] +
1385
0
                                    ai4_satd_4x4[13] + pi4_sad_grid[PART_ID_Nx2N_R];
1386
1387
0
    pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_4x4[2] + ai4_satd_4x4[6] + ai4_satd_4x4[10] +
1388
0
                                    ai4_satd_4x4[14] + pi4_sad_grid[PART_ID_Nx2N_L];
1389
1390
0
    pi4_sad_grid[PART_ID_nRx2N_R] =
1391
0
        ai4_satd_4x4[3] + ai4_satd_4x4[7] + ai4_satd_4x4[11] + ai4_satd_4x4[15];
1392
1393
0
    pi4_sad_grid[PART_ID_2NxnU_T] =
1394
0
        ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[2] + ai4_satd_4x4[3];
1395
1396
0
    pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_4x4[4] + ai4_satd_4x4[5] + ai4_satd_4x4[6] +
1397
0
                                    ai4_satd_4x4[7] + pi4_sad_grid[PART_ID_2NxN_B];
1398
1399
0
    pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[10] +
1400
0
                                    ai4_satd_4x4[11] + pi4_sad_grid[PART_ID_2NxN_T];
1401
1402
0
    pi4_sad_grid[PART_ID_2NxnD_B] =
1403
0
        ai4_satd_4x4[12] + ai4_satd_4x4[13] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1404
1405
    /* Call the update results function */
1406
0
    {
1407
0
        S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1408
0
        mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1409
0
        S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
1410
0
        S32 best_node_cost;
1411
0
        S32 second_best_node_cost;
1412
1413
        /*For each valid partition, update the refine_prm structure to reflect the best and second
1414
        best candidates for that partition*/
1415
1416
0
        for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
1417
0
        {
1418
0
            S32 update_required = 0;
1419
0
            S32 part_id = pi4_valid_part_ids[i4_count];
1420
0
            S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
1421
1422
            /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1423
0
            i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1424
1425
            /*Calculate total cost*/
1426
0
            i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
1427
0
            i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
1428
1429
            /*****************************************************************/
1430
            /* We do not labor through the results if the total cost worse   */
1431
            /* than the last of the results.                                 */
1432
            /*****************************************************************/
1433
0
            best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
1434
0
            second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
1435
1436
0
            if(i4_tot_cost < second_best_node_cost)
1437
0
            {
1438
0
                update_required = 2;
1439
1440
                /*************************************************************/
1441
                /* Identify where the current result isto be placed.Basically*/
1442
                /* find the node which has cost just higher thannodeundertest*/
1443
                /*************************************************************/
1444
0
                if(i4_tot_cost < best_node_cost)
1445
0
                {
1446
0
                    update_required = 1;
1447
0
                }
1448
0
                else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
1449
0
                {
1450
0
                    update_required = 0;
1451
0
                }
1452
0
                if(update_required == 2)
1453
0
                {
1454
0
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
1455
0
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
1456
0
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
1457
0
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
1458
0
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
1459
0
                }
1460
0
                else if(update_required == 1)
1461
0
                {
1462
0
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
1463
0
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index];
1464
0
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
1465
0
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1466
0
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] =
1467
0
                        ps_subpel_refine_ctxt->i2_mv_x[0][index];
1468
0
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] =
1469
0
                        ps_subpel_refine_ctxt->i2_mv_y[0][index];
1470
0
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
1471
0
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index];
1472
1473
0
                    ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
1474
0
                    ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
1475
0
                    ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
1476
0
                    ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
1477
0
                    ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
1478
0
                }
1479
0
            }
1480
0
        }
1481
0
    }
1482
0
}
1483
1484
//#if COMPUTE_16x16_R == C
1485
void hme_evalsatd_update_1_best_result_pt_pu_16x16(
1486
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
1487
2.47M
{
1488
2.47M
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1489
2.47M
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1490
2.47M
    S32 i4_satd_16x16; /* 16x16 satd cost     */
1491
2.47M
    S32 i;
1492
2.47M
    S16 ai2_8x8_had[256];
1493
2.47M
    S16 *pi2_y0;
1494
2.47M
    U08 *pu1_src, *pu1_pred;
1495
2.47M
    S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
1496
2.47M
    S32 *ppi4_hsad;
1497
1498
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1499
2.47M
    S32 *api4_satd_pu[HAD_32x32 + 1];
1500
2.47M
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1501
1502
2.47M
    U08 *pu1_inp = ps_prms->pu1_inp;
1503
2.47M
    U08 *pu1_ref = ps_prms->pu1_ref;
1504
1505
2.47M
    S32 inp_stride = ps_prms->i4_inp_stride;
1506
2.47M
    S32 ref_stride = ps_prms->i4_ref_stride;
1507
1508
2.47M
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1509
2.47M
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1510
2.47M
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1511
2.47M
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1512
1513
2.47M
    ppi4_hsad = api4_satd_pu[HAD_16x16];
1514
1515
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1516
12.3M
    for(i = 0; i < 4; i++)
1517
9.91M
    {
1518
9.91M
        pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8;
1519
9.91M
        pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8;
1520
9.91M
        pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1521
9.91M
        pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1522
1523
9.91M
        ihevce_had_8x8_using_4_4x4(
1524
9.91M
            pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
1525
9.91M
    }
1526
1527
    /* For SATD computation following TU size are assumed for a 16x16 CU */
1528
    /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs     */
1529
1530
    /* Update 8x8 SATDs */
1531
    /* Modified to cost calculation using only 4x4 SATD */
1532
1533
    //  ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1534
    //  ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7];
1535
    //  ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13];
1536
    //  ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1537
1538
    /* Update 16x16 SATDs */
1539
2.47M
    pi4_sad_grid[PART_ID_2Nx2N] =
1540
2.47M
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1541
1542
2.47M
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
1543
2.47M
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
1544
2.47M
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
1545
2.47M
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
1546
1547
    /* Update 8x16 / 16x8 SATDs */
1548
2.47M
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
1549
2.47M
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
1550
2.47M
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
1551
2.47M
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
1552
1553
    /* Update AMP SATDs 16x12,16x4, 12x16,4x16  */
1554
2.47M
    pi4_sad_grid[PART_ID_nLx2N_L] =
1555
2.47M
        ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
1556
2.47M
    pi4_sad_grid[PART_ID_nRx2N_R] =
1557
2.47M
        ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
1558
2.47M
    pi4_sad_grid[PART_ID_2NxnU_T] =
1559
2.47M
        ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1560
2.47M
    pi4_sad_grid[PART_ID_2NxnD_B] =
1561
2.47M
        ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1562
1563
2.47M
    pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
1564
2.47M
    pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
1565
2.47M
    pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
1566
2.47M
    pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
1567
1568
    /* Call the update results function */
1569
2.47M
    {
1570
2.47M
        S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1571
2.47M
        mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1572
2.47M
        S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
1573
2.47M
        S32 best_node_cost;
1574
2.47M
        S32 second_best_node_cost;
1575
1576
        /*For each valid partition, update the refine_prm structure to reflect the best and second
1577
        best candidates for that partition*/
1578
1579
34.5M
        for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
1580
32.0M
        {
1581
32.0M
            S32 update_required = 0;
1582
32.0M
            S32 part_id = pi4_valid_part_ids[i4_count];
1583
32.0M
            S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
1584
1585
            /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1586
32.0M
            i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1587
1588
            /*Calculate total cost*/
1589
32.0M
            i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
1590
32.0M
            i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
1591
1592
            /*****************************************************************/
1593
            /* We do not labor through the results if the total cost worse   */
1594
            /* than the last of the results.                                 */
1595
            /*****************************************************************/
1596
32.0M
            best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
1597
32.0M
            second_best_node_cost = SHRT_MAX;
1598
1599
32.0M
            if(i4_tot_cost < second_best_node_cost)
1600
32.0M
            {
1601
32.0M
                update_required = 0;
1602
1603
                /*************************************************************/
1604
                /* Identify where the current result isto be placed.Basically*/
1605
                /* find the node which has cost just higher thannodeundertest*/
1606
                /*************************************************************/
1607
32.0M
                if(i4_tot_cost < best_node_cost)
1608
1.60M
                {
1609
1.60M
                    update_required = 1;
1610
1.60M
                }
1611
30.4M
                else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
1612
7.32M
                {
1613
7.32M
                    update_required = 0;
1614
7.32M
                }
1615
32.0M
                if(update_required == 2)
1616
0
                {
1617
0
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
1618
0
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
1619
0
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
1620
0
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
1621
0
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
1622
0
                }
1623
32.0M
                else if(update_required == 1)
1624
1.60M
                {
1625
1.60M
                    ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
1626
1.60M
                    ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
1627
1.60M
                    ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
1628
1.60M
                    ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
1629
1.60M
                    ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
1630
1.60M
                }
1631
32.0M
            }
1632
32.0M
        }
1633
2.47M
    }
1634
2.47M
}
1635
1636
WORD32 hme_evalsatd_pt_pu_16x16_tu_rec(
1637
    err_prms_t *ps_prms,
1638
    WORD32 lambda,
1639
    WORD32 lambda_q_shift,
1640
    WORD32 i4_frm_qstep,
1641
    me_func_selector_t *ps_func_selector)
1642
169k
{
1643
169k
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1644
169k
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1645
169k
    S32 ai4_tu_split_8x8[16];
1646
169k
    S32 i4_satd_16x16; /* 16x16 satd cost     */
1647
1648
169k
    S32 ai4_tu_early_cbf_8x8[16];
1649
1650
    //S16 ai2_had_out[256];
1651
169k
    S16 *pi2_had_out;
1652
169k
    S32 tu_split_flag = 0;
1653
169k
    S32 early_cbf_flag = 0;
1654
169k
    S32 total_satd_cost = 0;
1655
1656
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1657
169k
    S32 *api4_satd_pu[HAD_32x32 + 1];
1658
169k
    S32 *api4_tu_split[HAD_32x32 + 1];
1659
169k
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1660
1661
169k
    U08 *pu1_inp = ps_prms->pu1_inp;
1662
169k
    U08 *pu1_ref = ps_prms->pu1_ref;
1663
1664
169k
    S32 inp_stride = ps_prms->i4_inp_stride;
1665
169k
    S32 ref_stride = ps_prms->i4_ref_stride;
1666
1667
    /* Initialize tu_split_cost to "0" */
1668
169k
    ps_prms->i4_tu_split_cost = 0;
1669
1670
169k
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1671
1672
169k
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1673
169k
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1674
169k
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1675
169k
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1676
1677
169k
    api4_tu_split[HAD_4x4] = NULL;
1678
169k
    api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
1679
169k
    api4_tu_split[HAD_16x16] = &tu_split_flag;
1680
169k
    api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1681
1682
169k
    api4_tu_early_cbf[HAD_4x4] = NULL;
1683
169k
    api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
1684
169k
    api4_tu_early_cbf[HAD_16x16] = &early_cbf_flag;
1685
169k
    api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1686
1687
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1688
169k
    ps_func_selector->pf_had_16x16_r(
1689
169k
        pu1_inp,
1690
169k
        inp_stride,
1691
169k
        pu1_ref,
1692
169k
        ref_stride,
1693
169k
        pi2_had_out,
1694
169k
        16,
1695
169k
        api4_satd_pu,
1696
169k
        api4_tu_split,
1697
169k
        api4_tu_early_cbf,
1698
169k
        0,
1699
169k
        4,
1700
169k
        lambda,
1701
169k
        lambda_q_shift,
1702
169k
        i4_frm_qstep,
1703
169k
        0,
1704
169k
        ps_prms->u1_max_tr_depth,
1705
169k
        ps_prms->u1_max_tr_size,
1706
169k
        &(ps_prms->i4_tu_split_cost),
1707
169k
        NULL);
1708
1709
169k
    total_satd_cost = i4_satd_16x16;
1710
1711
169k
    ps_prms->pi4_tu_split_flags[0] = tu_split_flag;
1712
1713
169k
    ps_prms->pi4_tu_early_cbf[0] = early_cbf_flag;
1714
1715
169k
    return total_satd_cost;
1716
169k
}
1717
1718
/**
1719
********************************************************************************
1720
*  @fn     S32 hme_evalsatd_pt_pu_32x32
1721
*
1722
*  @brief  Evaluates the SATD with partial updates for all the best partitions
1723
*          of a 32x32 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
1724
*
1725
*  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
1726
*                 pointer to sad grid of each partitions
1727
*
1728
*  @return     None
1729
********************************************************************************
1730
*/
1731
void hme_evalsatd_pt_pu_32x32(err_prms_t *ps_prms)
1732
26.6k
{
1733
    //S32 ai4_satd_4x4[64];   /* num 4x4s in a 32x32 */
1734
26.6k
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
1735
26.6k
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
1736
26.6k
    S32 i4_satd_32x32;
1737
    //    S16 ai2_had_out[32*32];
1738
26.6k
    U08 *pu1_src;
1739
26.6k
    U08 *pu1_pred;
1740
26.6k
    S32 i;
1741
1742
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1743
26.6k
    S32 *api4_satd_pu[HAD_32x32 + 1];
1744
26.6k
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1745
1746
26.6k
    U08 *pu1_inp = ps_prms->pu1_inp;
1747
26.6k
    U08 *pu1_ref = ps_prms->pu1_ref;
1748
1749
26.6k
    S32 inp_stride = ps_prms->i4_inp_stride;
1750
26.6k
    S32 ref_stride = ps_prms->i4_ref_stride;
1751
1752
    //api4_satd_pu[HAD_4x4]   = &ai4_satd_4x4[0];
1753
26.6k
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1754
26.6k
    api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
1755
26.6k
    api4_satd_pu[HAD_32x32] = &i4_satd_32x32;
1756
1757
    /* 32x32 SATD is calculates as the sum of the 4 8x8's in the block */
1758
452k
    for(i = 0; i < 16; i++)
1759
425k
    {
1760
425k
        pu1_src = pu1_inp + ((i & 0x3) << 3) + ((i >> 2) * inp_stride * 8);
1761
1762
425k
        pu1_pred = pu1_ref + ((i & 0x3) << 3) + ((i >> 2) * ref_stride * 8);
1763
1764
425k
        ai4_satd_8x8[i] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
1765
425k
            pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
1766
425k
    }
1767
1768
    /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1769
26.6k
    ai4_satd_16x16[0] = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[4] + ai4_satd_8x8[5];
1770
26.6k
    ai4_satd_16x16[1] = ai4_satd_8x8[2] + ai4_satd_8x8[3] + ai4_satd_8x8[6] + ai4_satd_8x8[7];
1771
26.6k
    ai4_satd_16x16[2] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[12] + ai4_satd_8x8[13];
1772
26.6k
    ai4_satd_16x16[3] = ai4_satd_8x8[10] + ai4_satd_8x8[11] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
1773
1774
    /* Update 32x32 SATD */
1775
26.6k
    pi4_sad_grid[PART_ID_2Nx2N] =
1776
26.6k
        ai4_satd_16x16[0] + ai4_satd_16x16[1] + ai4_satd_16x16[2] + ai4_satd_16x16[3];
1777
1778
    /* Update 16x16 SATDs */
1779
26.6k
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_16x16[0];
1780
26.6k
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_16x16[1];
1781
26.6k
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_16x16[2];
1782
26.6k
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_16x16[3];
1783
1784
    /* Update 16x32 / 32x16 SATDs */
1785
26.6k
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_16x16[0] + ai4_satd_16x16[2];
1786
26.6k
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_16x16[1] + ai4_satd_16x16[3];
1787
26.6k
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_16x16[0] + ai4_satd_16x16[1];
1788
26.6k
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_16x16[2] + ai4_satd_16x16[3];
1789
1790
    /* Update AMP SATDs 32x24,32x8, 24x32,8x32  */
1791
26.6k
    pi4_sad_grid[PART_ID_nLx2N_L] =
1792
26.6k
        ai4_satd_8x8[0] + ai4_satd_8x8[4] + ai4_satd_8x8[8] + ai4_satd_8x8[12];
1793
1794
26.6k
    pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[5] + ai4_satd_8x8[9] +
1795
26.6k
                                    ai4_satd_8x8[13] + pi4_sad_grid[PART_ID_Nx2N_R];
1796
1797
26.6k
    pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_8x8[2] + ai4_satd_8x8[6] + ai4_satd_8x8[10] +
1798
26.6k
                                    ai4_satd_8x8[14] + pi4_sad_grid[PART_ID_Nx2N_L];
1799
1800
26.6k
    pi4_sad_grid[PART_ID_nRx2N_R] =
1801
26.6k
        ai4_satd_8x8[3] + ai4_satd_8x8[7] + ai4_satd_8x8[11] + ai4_satd_8x8[15];
1802
1803
26.6k
    pi4_sad_grid[PART_ID_2NxnU_T] =
1804
26.6k
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1805
1806
26.6k
    pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_8x8[4] + ai4_satd_8x8[5] + ai4_satd_8x8[6] +
1807
26.6k
                                    ai4_satd_8x8[7] + pi4_sad_grid[PART_ID_2NxN_B];
1808
1809
26.6k
    pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[10] +
1810
26.6k
                                    ai4_satd_8x8[11] + pi4_sad_grid[PART_ID_2NxN_T];
1811
1812
26.6k
    pi4_sad_grid[PART_ID_2NxnD_B] =
1813
26.6k
        ai4_satd_8x8[12] + ai4_satd_8x8[13] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
1814
26.6k
}
1815
1816
WORD32 hme_evalsatd_pt_pu_32x32_tu_rec(
1817
    err_prms_t *ps_prms,
1818
    WORD32 lambda,
1819
    WORD32 lambda_q_shift,
1820
    WORD32 i4_frm_qstep,
1821
    me_func_selector_t *ps_func_selector)
1822
36.9k
{
1823
36.9k
    S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */
1824
36.9k
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
1825
36.9k
    S32 ai4_tu_split_8x8[16];
1826
36.9k
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
1827
36.9k
    S32 ai4_tu_split_16x16[4];
1828
36.9k
    S32 i4_satd_32x32;
1829
1830
36.9k
    S32 ai4_tu_early_cbf_8x8[16];
1831
36.9k
    S32 ai4_tu_early_cbf_16x16[4];
1832
36.9k
    S32 early_cbf_flag;
1833
1834
36.9k
    S16 *pi2_had_out;
1835
1836
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1837
36.9k
    S32 *api4_satd_pu[HAD_32x32 + 1];
1838
36.9k
    S32 *api4_tu_split[HAD_32x32 + 1];
1839
36.9k
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1840
1841
36.9k
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1842
36.9k
    S32 *pi4_tu_split_flag = ps_prms->pi4_tu_split_flags;
1843
36.9k
    S32 *pi4_tu_early_cbf = ps_prms->pi4_tu_early_cbf;
1844
1845
36.9k
    S32 tu_split_flag = 0;
1846
36.9k
    S32 total_satd_cost = 0;
1847
1848
36.9k
    U08 *pu1_inp = ps_prms->pu1_inp;
1849
36.9k
    U08 *pu1_ref = ps_prms->pu1_ref;
1850
1851
36.9k
    S32 inp_stride = ps_prms->i4_inp_stride;
1852
36.9k
    S32 ref_stride = ps_prms->i4_ref_stride;
1853
1854
    /* Initialize tu_split_cost to "0" */
1855
36.9k
    ps_prms->i4_tu_split_cost = 0;
1856
1857
36.9k
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1858
1859
36.9k
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1860
36.9k
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1861
36.9k
    api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
1862
36.9k
    api4_satd_pu[HAD_32x32] = &i4_satd_32x32;
1863
1864
36.9k
    api4_tu_split[HAD_4x4] = NULL;
1865
36.9k
    api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
1866
36.9k
    api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
1867
36.9k
    api4_tu_split[HAD_32x32] = &tu_split_flag;
1868
1869
36.9k
    api4_tu_early_cbf[HAD_4x4] = NULL;
1870
36.9k
    api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
1871
36.9k
    api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
1872
36.9k
    api4_tu_early_cbf[HAD_32x32] = &early_cbf_flag;
1873
1874
    /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
1875
36.9k
    ihevce_had_32x32_r(
1876
36.9k
        pu1_inp,
1877
36.9k
        inp_stride,
1878
36.9k
        pu1_ref,
1879
36.9k
        ref_stride,
1880
36.9k
        pi2_had_out,
1881
36.9k
        32,
1882
36.9k
        api4_satd_pu,
1883
36.9k
        api4_tu_split,
1884
36.9k
        api4_tu_early_cbf,
1885
36.9k
        0,
1886
36.9k
        8,
1887
36.9k
        lambda,
1888
36.9k
        lambda_q_shift,
1889
36.9k
        i4_frm_qstep,
1890
36.9k
        0,
1891
36.9k
        ps_prms->u1_max_tr_depth,
1892
36.9k
        ps_prms->u1_max_tr_size,
1893
36.9k
        &(ps_prms->i4_tu_split_cost),
1894
36.9k
        ps_func_selector);
1895
1896
36.9k
    total_satd_cost = i4_satd_32x32;
1897
1898
    /*The structure of the TU_SPLIT flag for the current 32x32 is as follows
1899
    TL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1900
    TR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1901
    BL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1902
    BR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1903
    32x32_split - 1bit (LSBit)
1904
1905
    TU_SPLIT : (TL_16x16)_(TR_16x16)_(BL_16x16)_(BR_16x16)_32x32_split (21bits)*/
1906
1907
36.9k
    pi4_sad_grid[PART_ID_2Nx2N] = total_satd_cost;
1908
36.9k
    pi4_tu_split_flag[PART_ID_2Nx2N] = tu_split_flag;
1909
36.9k
    pi4_tu_early_cbf[PART_ID_2Nx2N] = early_cbf_flag;
1910
1911
36.9k
    return total_satd_cost;
1912
36.9k
}
1913
1914
/**
1915
********************************************************************************
1916
*  @fn     S32 hme_evalsatd_pt_pu_64x64
1917
*
1918
*  @brief  Evaluates the SATD with partial updates for all the best partitions
1919
*          of a 64x64 CU based on accumulated Hadamard 32x32 and 16x16 satds
1920
*
1921
*           Note : 64x64 SATD does not do hadamard Transform using 32x32 hadamard
1922
*                  outputs but directly uses four 32x32 SATD and 16 16x16 SATDS as
1923
*                  TU size of 64 is not supported in HEVC
1924
*
1925
*  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
1926
*                 pointer to sad grid of each partitions
1927
*
1928
*  @return     None
1929
********************************************************************************
1930
*/
1931
1932
void hme_evalsatd_pt_pu_64x64(err_prms_t *ps_prms)
1933
2.42k
{
1934
    //S32 ai4_satd_4x4[4][64];   /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
1935
2.42k
    S32 ai4_satd_8x8[4][16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
1936
2.42k
    S32 ai4_satd_16x16[4][4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */
1937
2.42k
    S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */
1938
    //    S16 ai2_had_out[32*32];
1939
2.42k
    S32 i, j;
1940
1941
    //  S32 ai4_tu_split_8x8[4][16];
1942
    //  S32 ai4_tu_split_16x16[4][4];
1943
    //  S32 ai4_tu_split_32x32[4];
1944
1945
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1946
2.42k
    S32 *api4_satd_pu[HAD_32x32 + 1];
1947
    //  S32 *api4_tu_split[HAD_32x32 + 1];
1948
1949
2.42k
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1950
1951
2.42k
    U08 *pu1_inp = ps_prms->pu1_inp;
1952
2.42k
    U08 *pu1_ref = ps_prms->pu1_ref;
1953
2.42k
    U08 *pu1_src;
1954
2.42k
    U08 *pu1_pred;
1955
1956
2.42k
    S32 inp_stride = ps_prms->i4_inp_stride;
1957
2.42k
    S32 ref_stride = ps_prms->i4_ref_stride;
1958
1959
12.1k
    for(i = 0; i < 4; i++)
1960
9.70k
    {
1961
9.70k
        S32 blkx = (i & 0x1);
1962
9.70k
        S32 blky = (i >> 1);
1963
9.70k
        U08 *pu1_pi0, *pu1_pi1;
1964
1965
        //api4_satd_pu[HAD_4x4]   = &ai4_satd_4x4[i][0];
1966
9.70k
        api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[i][0];
1967
9.70k
        api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[i][0];
1968
9.70k
        api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];
1969
1970
9.70k
        pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
1971
9.70k
        pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);
1972
1973
        /* 64x64 SATD is calculates as the sum of the 4 16x16's in the block */
1974
164k
        for(j = 0; j < 16; j++)
1975
155k
        {
1976
155k
            pu1_src = pu1_pi0 + ((j & 0x3) << 3) + ((j >> 2) * inp_stride * 8);
1977
1978
155k
            pu1_pred = pu1_pi1 + ((j & 0x3) << 3) + ((j >> 2) * ref_stride * 8);
1979
1980
155k
            ai4_satd_8x8[i][j] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
1981
155k
                pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
1982
155k
        }
1983
1984
        /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1985
9.70k
        ai4_satd_16x16[i][0] =
1986
9.70k
            ai4_satd_8x8[i][0] + ai4_satd_8x8[i][1] + ai4_satd_8x8[i][4] + ai4_satd_8x8[i][5];
1987
9.70k
        ai4_satd_16x16[i][1] =
1988
9.70k
            ai4_satd_8x8[i][2] + ai4_satd_8x8[i][3] + ai4_satd_8x8[i][6] + ai4_satd_8x8[i][7];
1989
9.70k
        ai4_satd_16x16[i][2] =
1990
9.70k
            ai4_satd_8x8[i][8] + ai4_satd_8x8[i][9] + ai4_satd_8x8[i][12] + ai4_satd_8x8[i][13];
1991
9.70k
        ai4_satd_16x16[i][3] =
1992
9.70k
            ai4_satd_8x8[i][10] + ai4_satd_8x8[i][11] + ai4_satd_8x8[i][14] + ai4_satd_8x8[i][15];
1993
9.70k
    }
1994
1995
    /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1996
1997
2.42k
    ai4_satd_32x32[0] =
1998
2.42k
        ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3];
1999
2.42k
    ai4_satd_32x32[1] =
2000
2.42k
        ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1] + ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3];
2001
2.42k
    ai4_satd_32x32[2] =
2002
2.42k
        ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] + ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3];
2003
2.42k
    ai4_satd_32x32[3] =
2004
2.42k
        ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
2005
2006
    /* Update 64x64 SATDs */
2007
2.42k
    pi4_sad_grid[PART_ID_2Nx2N] =
2008
2.42k
        ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2009
2010
    /* Update 32x32 SATDs */
2011
2.42k
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_32x32[0];
2012
2.42k
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_32x32[1];
2013
2.42k
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_32x32[2];
2014
2.42k
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_32x32[3];
2015
2016
    /* Update 32x64 / 64x32 SATDs */
2017
2.42k
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_32x32[0] + ai4_satd_32x32[2];
2018
2.42k
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_32x32[1] + ai4_satd_32x32[3];
2019
2.42k
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_32x32[0] + ai4_satd_32x32[1];
2020
2.42k
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_32x32[2] + ai4_satd_32x32[3];
2021
2022
    /* Update AMP SATDs 64x48,64x16, 48x64,16x64  */
2023
2.42k
    pi4_sad_grid[PART_ID_nLx2N_L] =
2024
2.42k
        ai4_satd_16x16[0][0] + ai4_satd_16x16[0][2] + ai4_satd_16x16[2][0] + ai4_satd_16x16[2][2];
2025
2026
2.42k
    pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_16x16[0][1] + ai4_satd_16x16[0][3] +
2027
2.42k
                                    ai4_satd_16x16[2][1] + ai4_satd_16x16[2][3] +
2028
2.42k
                                    pi4_sad_grid[PART_ID_Nx2N_R];
2029
2030
2.42k
    pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_16x16[1][0] + ai4_satd_16x16[1][2] +
2031
2.42k
                                    ai4_satd_16x16[3][0] + ai4_satd_16x16[3][2] +
2032
2.42k
                                    pi4_sad_grid[PART_ID_Nx2N_L];
2033
2034
2.42k
    pi4_sad_grid[PART_ID_nRx2N_R] =
2035
2.42k
        ai4_satd_16x16[1][1] + ai4_satd_16x16[1][3] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][3];
2036
2037
2.42k
    pi4_sad_grid[PART_ID_2NxnU_T] =
2038
2.42k
        ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1];
2039
2040
2.42k
    pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3] +
2041
2.42k
                                    ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3] +
2042
2.42k
                                    pi4_sad_grid[PART_ID_2NxN_B];
2043
2044
2.42k
    pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] +
2045
2.42k
                                    ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] +
2046
2.42k
                                    pi4_sad_grid[PART_ID_2NxN_T];
2047
2048
2.42k
    pi4_sad_grid[PART_ID_2NxnD_B] =
2049
2.42k
        ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
2050
2.42k
}
2051
2052
WORD32 hme_evalsatd_pt_pu_64x64_tu_rec(
2053
    err_prms_t *ps_prms,
2054
    WORD32 lambda,
2055
    WORD32 lambda_q_shift,
2056
    WORD32 i4_frm_qstep,
2057
    me_func_selector_t *ps_func_selector)
2058
2.72k
{
2059
2.72k
    S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
2060
2.72k
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
2061
2.72k
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */
2062
2.72k
    S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */
2063
2064
2.72k
    S32 ai4_tu_split_8x8[16];
2065
2.72k
    S32 ai4_tu_split_16x16[4];
2066
2067
2.72k
    S32 ai4_tu_early_cbf_8x8[16];
2068
2.72k
    S32 ai4_tu_early_cbf_16x16[4];
2069
2070
2.72k
    S16 *pi2_had_out;
2071
2.72k
    S32 i;
2072
2073
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
2074
2.72k
    S32 *api4_satd_pu[HAD_32x32 + 1];
2075
2.72k
    S32 *api4_tu_split[HAD_32x32 + 1];
2076
2.72k
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
2077
2078
2.72k
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
2079
2080
2.72k
    S32 tu_split_flag = 0;
2081
2.72k
    S32 total_satd_cost = 0;
2082
2083
2.72k
    U08 *pu1_inp = ps_prms->pu1_inp;
2084
2.72k
    U08 *pu1_ref = ps_prms->pu1_ref;
2085
2086
2.72k
    S32 inp_stride = ps_prms->i4_inp_stride;
2087
2.72k
    S32 ref_stride = ps_prms->i4_ref_stride;
2088
2089
    /* Initialize tu_split_cost to "0" */
2090
2.72k
    ps_prms->i4_tu_split_cost = 0;
2091
2092
2.72k
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
2093
2094
13.6k
    for(i = 0; i < 4; i++)
2095
10.8k
    {
2096
10.8k
        S32 blkx = (i & 0x1);
2097
10.8k
        S32 blky = (i >> 1);
2098
10.8k
        U08 *pu1_pi0, *pu1_pi1;
2099
10.8k
        tu_split_flag = 0;
2100
2101
10.8k
        api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
2102
10.8k
        api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
2103
10.8k
        api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
2104
10.8k
        api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];
2105
2106
10.8k
        api4_tu_split[HAD_4x4] = NULL;
2107
10.8k
        api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
2108
10.8k
        api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
2109
10.8k
        api4_tu_split[HAD_32x32] = &ps_prms->pi4_tu_split_flags[i];
2110
2111
10.8k
        api4_tu_early_cbf[HAD_4x4] = NULL;
2112
10.8k
        api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
2113
10.8k
        api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
2114
10.8k
        api4_tu_early_cbf[HAD_32x32] = &ps_prms->pi4_tu_early_cbf[i];
2115
2116
10.8k
        pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
2117
10.8k
        pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);
2118
2119
        /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
2120
10.8k
        ihevce_had_32x32_r(
2121
10.8k
            pu1_pi0,
2122
10.8k
            inp_stride,
2123
10.8k
            pu1_pi1,
2124
10.8k
            ref_stride,
2125
10.8k
            pi2_had_out,
2126
10.8k
            32,
2127
10.8k
            api4_satd_pu,
2128
10.8k
            api4_tu_split,
2129
10.8k
            api4_tu_early_cbf,
2130
10.8k
            0,
2131
10.8k
            8,
2132
10.8k
            lambda,
2133
10.8k
            lambda_q_shift,
2134
10.8k
            i4_frm_qstep,
2135
10.8k
            1,
2136
10.8k
            ps_prms->u1_max_tr_depth,
2137
10.8k
            ps_prms->u1_max_tr_size,
2138
10.8k
            &(ps_prms->i4_tu_split_cost),
2139
10.8k
            ps_func_selector);
2140
10.8k
    }
2141
2142
2.72k
    total_satd_cost = ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2143
2144
    /* Update 64x64 SATDs */
2145
2.72k
    pi4_sad_grid[PART_ID_2Nx2N] =
2146
2.72k
        ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2147
2148
2.72k
    return total_satd_cost;
2149
2.72k
}
2150
2151
/**
2152
********************************************************************************
2153
*  @fn     void hme_subpel_refine_search_node(search_node_t *ps_search_node,
2154
*                                   hme_subpel_prms_t *ps_prms,
2155
*                                   layer_ctxt_t *ps_curr_layer,
2156
*                                   BLK_SIZE_T e_blk_size,
2157
*                                   S32 x_off,
2158
*                                   S32 y_off)
2159
*
2160
*  @brief  Refines a given partition within a CU
2161
*
2162
*  @param[in,out]  ps_search_node: supplies starting mv and also ref id.
2163
*                   updated with the accurate subpel mv
2164
*
2165
*  @param[in]  ps_prms: subpel prms input to this function
2166
*
2167
*  @param[in]  ps_curr_layer : layer context
2168
*
2169
*  @param[in]  e_blk_size : Block size enumeration
2170
*
2171
*  @param[in]  x_off : x offset of the partition w.r.t. pic start
2172
*
2173
*  @param[in]  y_off : y offset of the partition w.r.t. pic start
2174
*
2175
*  @return None
2176
********************************************************************************
2177
*/
2178
2179
static __inline PF_SAD_RESULT_FXN_T hme_get_calc_sad_and_result_subpel_fxn(
2180
    me_func_selector_t *ps_func_selector,
2181
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list,
2182
    S32 i4_part_mask,
2183
    U08 u1_use_satd,
2184
    U08 u1_num_parts,
2185
    U08 u1_num_results)
2186
275k
{
2187
275k
    PF_SAD_RESULT_FXN_T pf_err_compute;
2188
2189
275k
    ASSERT((1 == u1_num_results) || (2 == u1_num_results));
2190
2191
275k
    if(1 == u1_num_results)
2192
275k
    {
2193
275k
        if(u1_use_satd)
2194
207k
        {
2195
207k
            if(u1_num_parts == 1)
2196
19.0k
            {
2197
19.0k
                pf_err_compute =
2198
19.0k
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_eq_1;
2199
19.0k
            }
2200
188k
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2201
32.8k
            {
2202
32.8k
                pf_err_compute =
2203
32.8k
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_9;
2204
32.8k
            }
2205
155k
            else
2206
155k
            {
2207
155k
                pf_err_compute =
2208
155k
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_17;
2209
155k
            }
2210
207k
        }
2211
67.9k
        else
2212
67.9k
        {
2213
67.9k
            if(u1_num_parts == 1)
2214
47.0k
            {
2215
47.0k
                pf_err_compute = ps_me_optimised_function_list
2216
47.0k
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_eq_1;
2217
47.0k
            }
2218
20.8k
            else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2219
4.97k
            {
2220
4.97k
                pf_err_compute =
2221
4.97k
                    ps_me_optimised_function_list->pf_calc_sad_and_1_best_result_subpel_square_parts;
2222
4.97k
            }
2223
15.8k
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2224
7.73k
            {
2225
7.73k
                pf_err_compute = ps_me_optimised_function_list
2226
7.73k
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_9;
2227
7.73k
            }
2228
8.16k
            else
2229
8.16k
            {
2230
8.16k
                pf_err_compute = ps_me_optimised_function_list
2231
8.16k
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_17;
2232
8.16k
            }
2233
67.9k
        }
2234
275k
    }
2235
0
    else
2236
0
    {
2237
0
        if(u1_use_satd)
2238
0
        {
2239
0
            if(u1_num_parts == 1)
2240
0
            {
2241
0
                pf_err_compute =
2242
0
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_eq_1;
2243
0
            }
2244
0
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2245
0
            {
2246
0
                pf_err_compute =
2247
0
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_9;
2248
0
            }
2249
0
            else
2250
0
            {
2251
0
                pf_err_compute =
2252
0
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_17;
2253
0
            }
2254
0
        }
2255
0
        else
2256
0
        {
2257
0
            if(u1_num_parts == 1)
2258
0
            {
2259
0
                pf_err_compute = ps_me_optimised_function_list
2260
0
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_eq_1;
2261
0
            }
2262
0
            else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2263
0
            {
2264
0
                pf_err_compute = ps_me_optimised_function_list
2265
0
                                     ->pf_calc_sad_and_2_best_results_subpel_square_parts;
2266
0
            }
2267
0
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2268
0
            {
2269
0
                pf_err_compute = ps_me_optimised_function_list
2270
0
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_9;
2271
0
            }
2272
0
            else
2273
0
            {
2274
0
                pf_err_compute = ps_me_optimised_function_list
2275
0
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_17;
2276
0
            }
2277
0
        }
2278
0
    }
2279
2280
275k
    return pf_err_compute;
2281
275k
}
2282
2283
#if DIAMOND_GRID == 1
2284
S32 hme_subpel_refine_search_node_high_speed(
2285
    search_node_t *ps_search_node,
2286
    hme_subpel_prms_t *ps_prms,
2287
    layer_ctxt_t *ps_curr_layer,
2288
    BLK_SIZE_T e_blk_size,
2289
    S32 x_off,
2290
    S32 y_off,
2291
    search_results_t *ps_search_results,
2292
    S32 pred_lx,
2293
    S32 i4_part_mask,
2294
    S32 *pi4_valid_part_ids,
2295
    S32 search_idx,
2296
    subpel_dedup_enabler_t *ps_dedup_enabler,
2297
    me_func_selector_t *ps_func_selector,
2298
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
2299
275k
{
2300
275k
    S32 i4_num_hpel_refine, i4_num_qpel_refine;
2301
275k
    S32 i4_offset, i4_grid_mask;
2302
275k
    S08 i1_ref_idx;
2303
275k
    S32 i4_blk_wd, i4_blk_ht;
2304
275k
    S32 i4_ref_stride, i4_i;
2305
275k
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2306
275k
    result_upd_prms_t s_result_prms;
2307
275k
    search_node_t s_temp_search_node;
2308
2309
    /*************************************************************************/
2310
    /* Tracks current MV with the fractional component.                      */
2311
    /*************************************************************************/
2312
275k
    S32 i4_mv_x, i4_mv_y;
2313
275k
    S32 i4_frac_x, i4_frac_y;
2314
2315
    /*************************************************************************/
2316
    /* Function pointer for SAD/SATD, array and prms structure to pass to    */
2317
    /* This function                                                         */
2318
    /*************************************************************************/
2319
275k
    PF_SAD_RESULT_FXN_T pf_err_compute;
2320
2321
275k
    S32 ai4_sad_grid[17], i4_tot_cost;
2322
275k
    err_prms_t s_err_prms;
2323
2324
    /*************************************************************************/
2325
    /* Allowed MV RANGE                                                      */
2326
    /*************************************************************************/
2327
275k
    range_prms_t *ps_range_prms;
2328
2329
    /*************************************************************************/
2330
    /* stores min id in grid with associated min cost.                       */
2331
    /*************************************************************************/
2332
275k
    S32 i4_min_cost, i4_min_sad;
2333
275k
    GRID_PT_T e_min_id;
2334
2335
275k
    PF_INTERP_FXN_T pf_qpel_interp;
2336
    /*************************************************************************/
2337
    /* For hpel and qpel we move in diamonds and hence each point in the     */
2338
    /* diamond will belong to a completely different plane. To simplify the  */
2339
    /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
2340
    /* hpel planes which are interpolated during recon.                      */
2341
    /*************************************************************************/
2342
275k
    U08 *apu1_hpel_ref[4], *pu1_ref;
2343
2344
275k
    interp_prms_t s_interp_prms;
2345
2346
    /*************************************************************************/
2347
    /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2348
    /* points to the corresponding predicted buf with its stride.            */
2349
    /* Note that the pointer cannot be derived just from the id, since the   */
2350
    /* pointer may also point to the hpel buffer (in case we request interp  */
2351
    /* of a hpel pt, which already exists in the recon hpel planes)          */
2352
    /*************************************************************************/
2353
275k
    U08 *pu1_final_out;
2354
275k
    S32 i4_final_out_stride;
2355
275k
    S32 part_id;
2356
275k
    S32 check_for_duplicate = 0;
2357
2358
275k
    subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
2359
2360
275k
    S32 mvx_qpel;
2361
275k
    S32 mvy_qpel;
2362
2363
275k
    pf_err_compute = hme_get_calc_sad_and_result_subpel_fxn(
2364
275k
        ps_func_selector,
2365
275k
        ps_me_optimised_function_list,
2366
275k
        i4_part_mask,
2367
275k
        ps_prms->i4_use_satd,
2368
275k
        ps_subpel_refine_ctxt->i4_num_valid_parts,
2369
275k
        ps_search_results->u1_num_results_per_part);
2370
2371
275k
    i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2372
275k
    i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2373
2374
    /* Prediction context should now deal with qpel units */
2375
275k
    HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2376
2377
    /* Buffer allocation for subpel */
2378
    /* Current design is that there may be many partitions and different mvs */
2379
    /* that attempt subpel refinement. While there is possibility of overlap, the */
2380
    /* hashing to detect and avoid overlap may be very complex. So, currently,   */
2381
    /* the only thing done is to store the eventual predicted buffer with every  */
2382
    /* ctb node that holds the result of the best subpel search */
2383
2384
    /* Compute the base pointer for input, interpolated buffers */
2385
    /* The base pointers point as follows: */
2386
    /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, hy: 0.5, 0.5 */
2387
    /* To these, we need to add the offset of the current node */
2388
275k
    i4_ref_stride = ps_curr_layer->i4_rec_stride;
2389
275k
    i4_offset = x_off + (y_off * i4_ref_stride);
2390
275k
    i1_ref_idx = ps_search_node->i1_ref_idx;
2391
2392
275k
    apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
2393
275k
    apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
2394
275k
    apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
2395
275k
    apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
2396
2397
    /* Initialize result params used for partition update */
2398
275k
    s_result_prms.pf_mv_cost_compute = NULL;
2399
275k
    s_result_prms.ps_search_results = ps_search_results;
2400
275k
    s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
2401
275k
    s_result_prms.i1_ref_idx = ps_search_node->i1_ref_idx;
2402
275k
    s_result_prms.u1_pred_lx = search_idx;
2403
275k
    s_result_prms.i4_part_mask = i4_part_mask;
2404
275k
    s_result_prms.ps_search_node_base = ps_search_node;
2405
275k
    s_result_prms.pi4_sad_grid = &ai4_sad_grid[0];
2406
275k
    s_result_prms.i4_grid_mask = 1;
2407
275k
    s_result_prms.ps_search_node = &s_temp_search_node;
2408
275k
    s_temp_search_node.i1_ref_idx = ps_search_node->i1_ref_idx;
2409
2410
    /* convert to hpel units */
2411
275k
    i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
2412
275k
    i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
2413
2414
    /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
2415
275k
    ps_range_prms = ps_prms->aps_mv_range_qpel[i1_ref_idx];
2416
275k
    i4_grid_mask = (GRID_DIAMOND_ENABLE_ALL);
2417
275k
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2418
2419
275k
    i4_min_cost = MAX_32BIT_VAL;
2420
275k
    i4_min_sad = MAX_32BIT_VAL;
2421
2422
    /*************************************************************************/
2423
    /* Prepare the input params to SAD/SATD function. Note that input is     */
2424
    /* passed from the calling function since it may be I (normal subpel     */
2425
    /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
2426
    /* Both cases are handled here.                                          */
2427
    /*************************************************************************/
2428
275k
    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
2429
275k
    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
2430
275k
    s_err_prms.i4_ref_stride = i4_ref_stride;
2431
275k
    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
2432
275k
    s_err_prms.i4_grid_mask = 1;
2433
275k
    s_err_prms.pi4_sad_grid = &ai4_sad_grid[0];
2434
275k
    s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
2435
275k
    s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
2436
2437
275k
    s_result_prms.ps_subpel_refine_ctxt = ps_subpel_refine_ctxt;
2438
2439
275k
    part_id = ps_search_node->u1_part_id;
2440
445k
    for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
2441
408k
    {
2442
408k
        e_min_id = PT_C;
2443
2444
408k
        mvx_qpel = i4_mv_x << 1;
2445
408k
        mvy_qpel = i4_mv_y << 1;
2446
2447
        /* Central pt */
2448
408k
        if(i4_grid_mask & BIT_EN(PT_C))
2449
275k
        {
2450
            //ps_search_node->i2_mv_x = (S16)i4_mv_x;
2451
            //ps_search_node->i2_mv_x = (S16)i4_mv_y;
2452
            /* central pt is i4_mv_x, i4_mv_y */
2453
275k
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2454
275k
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
2455
2456
275k
            i4_frac_x = i4_mv_x & 1;
2457
275k
            i4_frac_y = i4_mv_y & 1;
2458
275k
            pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2459
275k
            s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2460
2461
            /* Update the mv's with the current candt motion vectors */
2462
275k
            s_result_prms.i2_mv_x = mvx_qpel;
2463
275k
            s_result_prms.i2_mv_y = mvy_qpel;
2464
275k
            s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2465
275k
            s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2466
2467
275k
            pf_err_compute(&s_err_prms, &s_result_prms);
2468
2469
275k
            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2470
275k
            if(i4_tot_cost < i4_min_cost)
2471
275k
            {
2472
275k
                i4_min_cost = i4_tot_cost;
2473
275k
                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2474
275k
                e_min_id = PT_C;
2475
275k
                pu1_final_out = s_err_prms.pu1_ref;
2476
275k
            }
2477
275k
        }
2478
2479
        /* left pt */
2480
408k
        if(i4_grid_mask & BIT_EN(PT_L))
2481
375k
        {
2482
375k
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2483
375k
                ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
2484
2485
375k
            if(!check_for_duplicate)
2486
363k
            {
2487
                /* search node mv is stored in qpel units */
2488
363k
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
2489
363k
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2490
                /* left pt is i4_mv_x - 1, i4_mv_y */
2491
363k
                i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
2492
363k
                i4_frac_y = i4_mv_y & 1;
2493
363k
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2494
363k
                s_err_prms.pu1_ref =
2495
363k
                    pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2496
2497
                /* Update the mv's with the current candt motion vectors */
2498
363k
                s_result_prms.i2_mv_x = mvx_qpel - 2;
2499
363k
                s_result_prms.i2_mv_y = mvy_qpel;
2500
363k
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 2;
2501
363k
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2502
2503
363k
                pf_err_compute(&s_err_prms, &s_result_prms);
2504
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2505
363k
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2506
363k
                if(i4_tot_cost < i4_min_cost)
2507
80.3k
                {
2508
80.3k
                    i4_min_cost = i4_tot_cost;
2509
80.3k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2510
80.3k
                    e_min_id = PT_L;
2511
80.3k
                    pu1_final_out = s_err_prms.pu1_ref;
2512
80.3k
                }
2513
363k
            }
2514
375k
        }
2515
        /* top pt */
2516
408k
        if(i4_grid_mask & BIT_EN(PT_T))
2517
373k
        {
2518
373k
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2519
373k
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
2520
2521
373k
            if(!check_for_duplicate)
2522
360k
            {
2523
                /* search node mv is stored in qpel units */
2524
360k
                ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
2525
360k
                ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
2526
                /* top pt is i4_mv_x, i4_mv_y - 1 */
2527
360k
                i4_frac_x = i4_mv_x & 1;
2528
360k
                i4_frac_y = (i4_mv_y - 1) & 1;
2529
360k
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2530
360k
                s_err_prms.pu1_ref =
2531
360k
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
2532
2533
                /* Update the mv's with the current candt motion vectors */
2534
360k
                s_result_prms.i2_mv_x = mvx_qpel;
2535
360k
                s_result_prms.i2_mv_y = mvy_qpel - 2;
2536
360k
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2537
360k
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 2;
2538
2539
360k
                pf_err_compute(&s_err_prms, &s_result_prms);
2540
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2541
360k
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2542
360k
                if(i4_tot_cost < i4_min_cost)
2543
56.2k
                {
2544
56.2k
                    i4_min_cost = i4_tot_cost;
2545
56.2k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2546
56.2k
                    e_min_id = PT_T;
2547
56.2k
                    pu1_final_out = s_err_prms.pu1_ref;
2548
56.2k
                }
2549
360k
            }
2550
373k
        }
2551
        /* right pt */
2552
408k
        if(i4_grid_mask & BIT_EN(PT_R))
2553
375k
        {
2554
375k
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2555
375k
                ps_dedup_enabler, num_unique_nodes, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
2556
375k
            if(!check_for_duplicate)
2557
362k
            {
2558
                /* search node mv is stored in qpel units */
2559
362k
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
2560
362k
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2561
                /* right pt is i4_mv_x + 1, i4_mv_y */
2562
362k
                i4_frac_x = (i4_mv_x + 1) & 1;
2563
362k
                i4_frac_y = i4_mv_y & 1;
2564
2565
362k
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2566
362k
                s_err_prms.pu1_ref =
2567
362k
                    pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2568
2569
                /* Update the mv's with the current candt motion vectors */
2570
362k
                s_result_prms.i2_mv_x = mvx_qpel + 2;
2571
362k
                s_result_prms.i2_mv_y = mvy_qpel;
2572
362k
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 2;
2573
362k
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2574
2575
362k
                pf_err_compute(&s_err_prms, &s_result_prms);
2576
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2577
362k
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2578
362k
                if(i4_tot_cost < i4_min_cost)
2579
49.9k
                {
2580
49.9k
                    i4_min_cost = i4_tot_cost;
2581
49.9k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2582
49.9k
                    e_min_id = PT_R;
2583
49.9k
                    pu1_final_out = s_err_prms.pu1_ref;
2584
49.9k
                }
2585
362k
            }
2586
375k
        }
2587
        /* bottom pt */
2588
408k
        if(i4_grid_mask & BIT_EN(PT_B))
2589
375k
        {
2590
375k
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2591
375k
                ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
2592
375k
            if(!check_for_duplicate)
2593
361k
            {
2594
                /* search node mv is stored in qpel units */
2595
361k
                ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
2596
361k
                ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
2597
361k
                i4_frac_x = i4_mv_x & 1;
2598
361k
                i4_frac_y = (i4_mv_y + 1) & 1;
2599
361k
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2600
361k
                s_err_prms.pu1_ref =
2601
361k
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
2602
2603
                /* Update the mv's with the current candt motion vectors */
2604
361k
                s_result_prms.i2_mv_x = mvx_qpel;
2605
361k
                s_result_prms.i2_mv_y = mvy_qpel + 2;
2606
361k
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2607
361k
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 2;
2608
2609
361k
                pf_err_compute(&s_err_prms, &s_result_prms);
2610
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2611
361k
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2612
361k
                if(i4_tot_cost < i4_min_cost)
2613
42.6k
                {
2614
42.6k
                    i4_min_cost = i4_tot_cost;
2615
42.6k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2616
42.6k
                    e_min_id = PT_B;
2617
42.6k
                    pu1_final_out = s_err_prms.pu1_ref;
2618
42.6k
                }
2619
361k
            }
2620
375k
        }
2621
        /* Early exit in case of central point */
2622
408k
        if(e_min_id == PT_C)
2623
238k
            break;
2624
2625
        /*********************************************************************/
2626
        /* Depending on the best result location, we may be able to skip     */
2627
        /* atleast two pts, centre pt and one more pt. E.g. if right pt is   */
2628
        /* the best result, the next iteration need not do centre, left pts  */
2629
        /*********************************************************************/
2630
169k
        i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2631
169k
        i4_mv_x += gai1_grid_id_to_x[e_min_id];
2632
169k
        i4_mv_y += gai1_grid_id_to_y[e_min_id];
2633
169k
        ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2634
169k
        ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2635
169k
        i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2636
169k
    }
2637
2638
    /* Convert to QPEL units */
2639
275k
    i4_mv_x <<= 1;
2640
275k
    i4_mv_y <<= 1;
2641
2642
275k
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2643
275k
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2644
2645
    /* Exact interpolation or averaging chosen here */
2646
275k
    pf_qpel_interp = ps_prms->pf_qpel_interp;
2647
2648
    /* Next QPEL ME */
2649
    /* In this case, we have option of doing exact QPEL interpolation or avg */
2650
    /*************************************************************************/
2651
    /*        x                                                              */
2652
    /*    A b C d                                                            */
2653
    /*    e f g h                                                            */
2654
    /*    I j K l                                                            */
2655
    /*    m n o p                                                            */
2656
    /*    Q r S t                                                            */
2657
    /*                                                                       */
2658
    /*    Approximate QPEL logic                                             */
2659
    /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
2660
    /*    for any given pt, we can get all the information required about    */
2661
    /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
2662
    /*     surrounding pts info:                                             */
2663
    /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
2664
    /*           buffer 2: hxfy, offsets for both are 0, 0                   */
2665
    /*    similarly for other pts the info can be gotten                     */
2666
    /*************************************************************************/
2667
275k
    i4_grid_mask = GRID_DIAMOND_ENABLE_ALL ^ (BIT_EN(PT_C));
2668
275k
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2669
2670
    /*************************************************************************/
2671
    /* One time preparation of non changing interpolation params. These      */
2672
    /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
2673
    /* working memory (not used though in case of averaging).                */
2674
    /*************************************************************************/
2675
275k
    s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
2676
275k
    s_interp_prms.i4_ref_stride = i4_ref_stride;
2677
275k
    s_interp_prms.i4_blk_wd = i4_blk_wd;
2678
275k
    s_interp_prms.i4_blk_ht = i4_blk_ht;
2679
2680
275k
    i4_final_out_stride = i4_ref_stride;
2681
2682
275k
    {
2683
275k
        U08 *pu1_mem;
2684
        /*********************************************************************/
2685
        /* Allocation of working memory for interpolated buffers. We maintain*/
2686
        /* an intermediate working buffer, and 2 ping pong interpolated out  */
2687
        /* buffers, purpose of ping pong explained later below               */
2688
        /*********************************************************************/
2689
275k
        pu1_mem = ps_prms->pu1_wkg_mem;
2690
275k
        s_interp_prms.pu1_wkg_mem = pu1_mem;
2691
2692
        //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
2693
275k
        s_interp_prms.apu1_interp_out[0] = pu1_mem;
2694
2695
275k
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2696
275k
        s_interp_prms.apu1_interp_out[1] = pu1_mem;
2697
2698
275k
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2699
275k
        s_interp_prms.apu1_interp_out[2] = pu1_mem;
2700
2701
275k
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2702
275k
        s_interp_prms.apu1_interp_out[3] = pu1_mem;
2703
2704
275k
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2705
275k
        s_interp_prms.apu1_interp_out[4] = pu1_mem;
2706
2707
        /*********************************************************************/
2708
        /* Stride of interpolated output is just a function of blk width of  */
2709
        /* this partition and hence remains constant for this partition      */
2710
        /*********************************************************************/
2711
275k
        s_interp_prms.i4_out_stride = (i4_blk_wd);
2712
275k
    }
2713
2714
275k
    {
2715
275k
        UWORD8 *apu1_final[4];
2716
275k
        WORD32 ai4_ref_stride[4];
2717
        /*************************************************************************/
2718
        /* Ping pong design for interpolated buffers. We use a min id, which     */
2719
        /* tracks the id of the ppu1_interp_out that stores the best result.     */
2720
        /* When new interp to be done, it uses 1 - best result id to do the interp*/
2721
        /* min id is toggled when any new result becomes the best result.        */
2722
        /*************************************************************************/
2723
2724
408k
        for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
2725
373k
        {
2726
373k
            e_min_id = PT_C;
2727
2728
373k
            mvx_qpel = i4_mv_x;
2729
373k
            mvy_qpel = i4_mv_y;
2730
373k
            hme_qpel_interp_comprehensive(
2731
373k
                &s_interp_prms,
2732
373k
                apu1_final,
2733
373k
                ai4_ref_stride,
2734
373k
                i4_mv_x,
2735
373k
                i4_mv_y,
2736
373k
                i4_grid_mask,
2737
373k
                ps_me_optimised_function_list);
2738
373k
            if(i4_grid_mask & BIT_EN(PT_L))
2739
347k
            {
2740
347k
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2741
347k
                    ps_dedup_enabler,
2742
347k
                    num_unique_nodes,
2743
347k
                    mvx_qpel - 1,
2744
347k
                    mvy_qpel - 0,
2745
347k
                    check_for_duplicate);
2746
2747
347k
                if(!check_for_duplicate)
2748
332k
                {
2749
332k
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
2750
332k
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2751
2752
332k
                    s_err_prms.pu1_ref = apu1_final[0];
2753
332k
                    s_err_prms.i4_ref_stride = ai4_ref_stride[0];
2754
2755
                    /* Update the mv's with the current candt motion vectors */
2756
332k
                    s_result_prms.i2_mv_x = mvx_qpel - 1;
2757
332k
                    s_result_prms.i2_mv_y = mvy_qpel;
2758
332k
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 1;
2759
332k
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2760
2761
332k
                    pf_err_compute(&s_err_prms, &s_result_prms);
2762
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2763
2764
332k
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2765
332k
                    if(i4_tot_cost < i4_min_cost)
2766
56.7k
                    {
2767
56.7k
                        e_min_id = PT_L;
2768
56.7k
                        i4_min_cost = i4_tot_cost;
2769
56.7k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2770
56.7k
                    }
2771
332k
                }
2772
347k
            }
2773
373k
            if(i4_grid_mask & BIT_EN(PT_T))
2774
344k
            {
2775
344k
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2776
344k
                    ps_dedup_enabler,
2777
344k
                    num_unique_nodes,
2778
344k
                    mvx_qpel - 0,
2779
344k
                    mvy_qpel - 1,
2780
344k
                    check_for_duplicate);
2781
2782
344k
                if(!check_for_duplicate)
2783
322k
                {
2784
322k
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2785
322k
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
2786
2787
322k
                    s_err_prms.pu1_ref = apu1_final[1];
2788
322k
                    s_err_prms.i4_ref_stride = ai4_ref_stride[1];
2789
2790
                    /* Update the mv's with the current candt motion vectors */
2791
322k
                    s_result_prms.i2_mv_x = mvx_qpel;
2792
322k
                    s_result_prms.i2_mv_y = mvy_qpel - 1;
2793
2794
322k
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2795
322k
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 1;
2796
2797
322k
                    pf_err_compute(&s_err_prms, &s_result_prms);
2798
2799
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2800
322k
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2801
322k
                    if(i4_tot_cost < i4_min_cost)
2802
43.2k
                    {
2803
43.2k
                        e_min_id = PT_T;
2804
43.2k
                        i4_min_cost = i4_tot_cost;
2805
43.2k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2806
43.2k
                    }
2807
322k
                }
2808
344k
            }
2809
373k
            if(i4_grid_mask & BIT_EN(PT_R))
2810
346k
            {
2811
346k
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2812
346k
                    ps_dedup_enabler, num_unique_nodes, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
2813
2814
346k
                if(!check_for_duplicate)
2815
332k
                {
2816
332k
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
2817
332k
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2818
2819
332k
                    s_err_prms.pu1_ref = apu1_final[2];
2820
332k
                    s_err_prms.i4_ref_stride = ai4_ref_stride[2];
2821
2822
                    /* Update the mv's with the current candt motion vectors */
2823
332k
                    s_result_prms.i2_mv_x = mvx_qpel + 1;
2824
332k
                    s_result_prms.i2_mv_y = mvy_qpel;
2825
2826
332k
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 1;
2827
332k
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2828
2829
332k
                    pf_err_compute(&s_err_prms, &s_result_prms);
2830
2831
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2832
2833
332k
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2834
332k
                    if(i4_tot_cost < i4_min_cost)
2835
38.2k
                    {
2836
38.2k
                        e_min_id = PT_R;
2837
38.2k
                        i4_min_cost = i4_tot_cost;
2838
38.2k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2839
38.2k
                    }
2840
332k
                }
2841
346k
            }
2842
            /* i4_mv_x and i4_mv_y will always be the centre pt */
2843
            /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
2844
373k
            if(i4_grid_mask & BIT_EN(PT_B))
2845
344k
            {
2846
344k
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2847
344k
                    ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
2848
2849
344k
                if(!check_for_duplicate)
2850
323k
                {
2851
323k
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2852
323k
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
2853
2854
323k
                    s_err_prms.pu1_ref = apu1_final[3];
2855
323k
                    s_err_prms.i4_ref_stride = ai4_ref_stride[3];
2856
2857
                    /* Update the mv's with the current candt motion vectors */
2858
323k
                    s_result_prms.i2_mv_x = mvx_qpel;
2859
323k
                    s_result_prms.i2_mv_y = mvy_qpel + 1;
2860
2861
323k
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2862
323k
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 1;
2863
2864
323k
                    pf_err_compute(&s_err_prms, &s_result_prms);
2865
2866
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2867
323k
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2868
323k
                    if(i4_tot_cost < i4_min_cost)
2869
34.1k
                    {
2870
34.1k
                        e_min_id = PT_B;
2871
34.1k
                        i4_min_cost = i4_tot_cost;
2872
34.1k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2873
34.1k
                    }
2874
323k
                }
2875
344k
            }
2876
2877
            /* New QPEL mv x and y */
2878
373k
            if(e_min_id == PT_C)
2879
239k
                break;
2880
133k
            i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2881
133k
            i4_mv_x += gai1_grid_id_to_x[e_min_id];
2882
133k
            i4_mv_y += gai1_grid_id_to_y[e_min_id];
2883
133k
            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2884
133k
            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2885
133k
            i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2886
133k
        }
2887
275k
    }
2888
2889
    /* update modified motion vectors and cost at end of subpel */
2890
275k
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2891
275k
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2892
275k
    ps_search_node->i4_tot_cost = i4_min_cost;
2893
275k
    ps_search_node->i4_sad = i4_min_sad;
2894
2895
    /********************************************************************************/
2896
    /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
2897
    /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
2898
    /********************************************************************************/
2899
    //ps_pred_ctxt->lambda >>= 1;
2900
2901
275k
    return (i4_min_cost);
2902
275k
}
2903
#elif DIAMOND_GRID == 0
2904
S32 hme_subpel_refine_search_node_high_speed(
2905
    search_node_t *ps_search_node,
2906
    hme_subpel_prms_t *ps_prms,
2907
    layer_ctxt_t *ps_curr_layer,
2908
    BLK_SIZE_T e_blk_size,
2909
    S32 x_off,
2910
    S32 y_off,
2911
    search_results_t *ps_search_results,
2912
    S32 pred_lx,
2913
    S32 i4_part_mask,
2914
    S32 *pi4_valid_part_ids,
2915
    S32 search_idx,
2916
    subpel_dedup_enabler_t *ps_dedup_enabler,
2917
    me_func_selector_t *ps_func_selector)
2918
{
2919
    S32 i4_num_hpel_refine, i4_num_qpel_refine;
2920
    S32 i4_offset, i4_grid_mask;
2921
    S08 i1_ref_idx;
2922
    S32 i4_blk_wd, i4_blk_ht;
2923
    S32 i4_ref_stride, i4_i;
2924
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2925
    result_upd_prms_t s_result_prms;
2926
2927
    /*************************************************************************/
2928
    /* Tracks current MV with the fractional component.                      */
2929
    /*************************************************************************/
2930
    S32 i4_mv_x, i4_mv_y;
2931
    S32 i4_frac_x, i4_frac_y;
2932
2933
    /*************************************************************************/
2934
    /* Function pointer for SAD/SATD, array and prms structure to pass to    */
2935
    /* This function                                                         */
2936
    /*************************************************************************/
2937
    PF_SAD_FXN_T pf_err_compute;
2938
    S32 ai4_sad_grid[9][17], i4_tot_cost;
2939
    err_prms_t s_err_prms;
2940
2941
    /*************************************************************************/
2942
    /* Allowed MV RANGE                                                      */
2943
    /*************************************************************************/
2944
    range_prms_t *ps_range_prms;
2945
2946
    /*************************************************************************/
2947
    /* stores min id in grid with associated min cost.                       */
2948
    /*************************************************************************/
2949
    S32 i4_min_cost, i4_min_sad;
2950
    GRID_PT_T e_min_id;
2951
2952
    PF_INTERP_FXN_T pf_qpel_interp;
2953
    /*************************************************************************/
2954
    /* For hpel and qpel we move in diamonds and hence each point in the     */
2955
    /* diamond will belong to a completely different plane. To simplify the  */
2956
    /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
2957
    /* hpel planes which are interpolated during recon.                      */
2958
    /*************************************************************************/
2959
    U08 *apu1_hpel_ref[4], *pu1_ref;
2960
2961
    interp_prms_t s_interp_prms;
2962
2963
    /*************************************************************************/
2964
    /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2965
    /* points to the corresponding predicted buf with its stride.            */
2966
    /* Note that the pointer cannot be derived just from the id, since the   */
2967
    /* pointer may also point to the hpel buffer (in case we request interp  */
2968
    /* of a hpel pt, which already exists in the recon hpel planes)          */
2969
    /*************************************************************************/
2970
    U08 *pu1_final_out;
2971
    S32 i4_final_out_stride;
2972
    S32 part_id;
2973
    S32 check_for_duplicate = 0;
2974
2975
    S32 mvx_qpel;
2976
    S32 mvy_qpel;
2977
2978
    /*************************************************************************/
2979
    /* Appropriate Err compute fxn, depends on SAD/SATD, blk size and remains*/
2980
    /* fixed through this subpel refinement for this partition.              */
2981
    /* Note, we do not enable grid sads since each pt is different buffers.  */
2982
    /* Hence, part mask is also nearly dont care and we use 2Nx2N enabled.   */
2983
    /*************************************************************************/
2984
    if(ps_prms->i4_use_satd)
2985
    {
2986
        pf_err_compute = hme_evalsatd_update_1_best_result_pt_pu_16x16;
2987
    }
2988
    else
2989
    {
2990
        pf_err_compute = hme_evalsad_grid_pu_16x16; /* hme_evalsad_pt_pu_16x16; */
2991
    }
2992
2993
    i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2994
    i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2995
2996
    /* Prediction context should now deal with qpel units */
2997
    HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2998
2999
    /* Buffer allocation for subpel */
3000
    /* Current design is that there may be many partitions and different mvs */
3001
    /* that attempt subpel refinement. While there is possibility of overlap, the */
3002
    /* hashing to detect and avoid overlap may be very complex. So, currently,   */
3003
    /* the only thing done is to store the eventual predicted buffer with every  */
3004
    /* ctb node that holds the result of the best subpel search */
3005
3006
    /* Compute the base pointer for input, interpolated buffers */
3007
    /* The base pointers point as follows: */
3008
    /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, hy: 0.5, 0.5 */
3009
    /* To these, we need to add the offset of the current node */
3010
    i4_ref_stride = ps_curr_layer->i4_rec_stride;
3011
    i4_offset = x_off + (y_off * i4_ref_stride);
3012
    i1_ref_idx = ps_search_node->i1_ref_idx;
3013
3014
    apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
3015
    apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
3016
    apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
3017
    apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
3018
3019
    /* Initialize result params used for partition update */
3020
    s_result_prms.pf_mv_cost_compute = NULL;
3021
    s_result_prms.ps_search_results = ps_search_results;
3022
    s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
3023
    s_result_prms.i1_ref_idx = search_idx;
3024
    s_result_prms.i4_part_mask = i4_part_mask;
3025
    s_result_prms.ps_search_node_base = ps_search_node;
3026
    s_result_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3027
    s_result_prms.i4_grid_mask = 1;
3028
3029
    /* convert to hpel units */
3030
    i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
3031
    i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
3032
3033
    /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
3034
    ps_range_prms = ps_prms->ps_mv_range_qpel;
3035
    i4_grid_mask = (GRID_ALL_PTS_VALID);
3036
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3037
3038
    i4_min_cost = MAX_32BIT_VAL;
3039
    i4_min_sad = MAX_32BIT_VAL;
3040
3041
    /*************************************************************************/
3042
    /* Prepare the input params to SAD/SATD function. Note that input is     */
3043
    /* passed from the calling function since it may be I (normal subpel     */
3044
    /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
3045
    /* Both cases are handled here.                                          */
3046
    /*************************************************************************/
3047
    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
3048
    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
3049
    s_err_prms.i4_ref_stride = i4_ref_stride;
3050
    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
3051
    s_err_prms.i4_grid_mask = 1;
3052
    s_err_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3053
    s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
3054
    s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
3055
3056
    /* TODO: Currently doubling lambda for Hadamard Sad instead of 1.9*sadlambda */
3057
    //ps_pred_ctxt->lambda <<= 1;
3058
    part_id = ps_search_node->u1_part_id;
3059
    for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
3060
    {
3061
        e_min_id = PT_C;
3062
3063
        mvx_qpel = i4_mv_x << 1;
3064
        mvy_qpel = i4_mv_y << 1;
3065
3066
        /* Central pt */
3067
        if(i4_grid_mask & BIT_EN(PT_C))
3068
        {
3069
            //ps_search_node->i2_mv_x = (S16)i4_mv_x;
3070
            //ps_search_node->i2_mv_x = (S16)i4_mv_y;
3071
            /* central pt is i4_mv_x, i4_mv_y */
3072
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3073
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
3074
3075
            i4_frac_x = i4_mv_x & 1;
3076
            i4_frac_y = i4_mv_y & 1;
3077
            pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3078
            s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3079
            pf_err_compute(&s_err_prms);
3080
            /* Update the mv's with the current candt motion vectors */
3081
            s_result_prms.i2_mv_x = mvx_qpel;
3082
            s_result_prms.i2_mv_y = mvy_qpel;
3083
            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3084
            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3085
            if(i4_tot_cost < i4_min_cost)
3086
            {
3087
                i4_min_cost = i4_tot_cost;
3088
                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3089
                e_min_id = PT_C;
3090
                pu1_final_out = s_err_prms.pu1_ref;
3091
            }
3092
        }
3093
3094
        /* left pt */
3095
        if(i4_grid_mask & BIT_EN(PT_L))
3096
        {
3097
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3098
                ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
3099
3100
            if(!check_for_duplicate)
3101
            {
3102
                /* search node mv is stored in qpel units */
3103
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
3104
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3105
                /* left pt is i4_mv_x - 1, i4_mv_y */
3106
                i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
3107
                i4_frac_y = i4_mv_y & 1;
3108
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3109
                s_err_prms.pu1_ref =
3110
                    pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3111
3112
                pf_err_compute(&s_err_prms);
3113
                /* Update the mv's with the current candt motion vectors */
3114
                s_result_prms.i2_mv_x = mvx_qpel;
3115
                s_result_prms.i2_mv_y = mvy_qpel;
3116
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3117
3118
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3119
3120
                if(i4_tot_cost < i4_min_cost)
3121
                {
3122
                    i4_min_cost = i4_tot_cost;
3123
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3124
                    e_min_id = PT_L;
3125
                    pu1_final_out = s_err_prms.pu1_ref;
3126
                }
3127
            }
3128
        }
3129
        /* top pt */
3130
        if(i4_grid_mask & BIT_EN(PT_T))
3131
        {
3132
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3133
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
3134
3135
            if(!check_for_duplicate)
3136
            {
3137
                /* search node mv is stored in qpel units */
3138
                ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3139
                ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
3140
                /* top pt is i4_mv_x, i4_mv_y - 1 */
3141
                i4_frac_x = i4_mv_x & 1;
3142
                i4_frac_y = (i4_mv_y - 1) & 1;
3143
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3144
                s_err_prms.pu1_ref =
3145
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
3146
                pf_err_compute(&s_err_prms);
3147
                /* Update the mv's with the current candt motion vectors */
3148
                s_result_prms.i2_mv_x = mvx_qpel;
3149
                s_result_prms.i2_mv_y = mvy_qpel - 2;
3150
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3151
3152
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3153
3154
                if(i4_tot_cost < i4_min_cost)
3155
                {
3156
                    i4_min_cost = i4_tot_cost;
3157
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3158
                    e_min_id = PT_T;
3159
                    pu1_final_out = s_err_prms.pu1_ref;
3160
                }
3161
            }
3162
        }
3163
        /* right pt */
3164
        if(i4_grid_mask & BIT_EN(PT_R))
3165
        {
3166
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3167
                ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
3168
3169
            if(!check_for_duplicate)
3170
            {
3171
                /* search node mv is stored in qpel units */
3172
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
3173
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3174
                /* right pt is i4_mv_x + 1, i4_mv_y */
3175
                i4_frac_x = (i4_mv_x + 1) & 1;
3176
                i4_frac_y = i4_mv_y & 1;
3177
3178
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3179
                s_err_prms.pu1_ref =
3180
                    pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3181
                pf_err_compute(&s_err_prms);
3182
                /* Update the mv's with the current candt motion vectors */
3183
                s_result_prms.i2_mv_x = mvx_qpel + 2;
3184
                s_result_prms.i2_mv_y = mvy_qpel;
3185
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3186
3187
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3188
3189
                if(i4_tot_cost < i4_min_cost)
3190
                {
3191
                    i4_min_cost = i4_tot_cost;
3192
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3193
                    e_min_id = PT_R;
3194
                    pu1_final_out = s_err_prms.pu1_ref;
3195
                }
3196
            }
3197
        }
3198
        /* bottom pt */
3199
        if(i4_grid_mask & BIT_EN(PT_B))
3200
        {
3201
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3202
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
3203
3204
            if(!check_for_duplicate)
3205
            {
3206
                /* search node mv is stored in qpel units */
3207
                ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
3208
                ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
3209
                i4_frac_x = i4_mv_x & 1;
3210
                i4_frac_y = (i4_mv_y + 1) & 1;
3211
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3212
                s_err_prms.pu1_ref =
3213
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
3214
3215
                pf_err_compute(&s_err_prms);
3216
                /* Update the mv's with the current candt motion vectors */
3217
                s_result_prms.i2_mv_x = mvx_qpel;
3218
                s_result_prms.i2_mv_y = mvy_qpel + 2;
3219
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3220
3221
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3222
3223
                if(i4_tot_cost < i4_min_cost)
3224
                {
3225
                    i4_min_cost = i4_tot_cost;
3226
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3227
                    e_min_id = PT_B;
3228
                    pu1_final_out = s_err_prms.pu1_ref;
3229
                }
3230
            }
3231
        }
3232
        if(e_min_id == PT_C)
3233
        {
3234
            if(!i4_i)
3235
            {
3236
                /* TL pt */
3237
                if(i4_grid_mask & BIT_EN(PT_TL))
3238
                {
3239
                    S32 mvx_minus_1 = (i4_mv_x - 1);
3240
                    S32 mvy_minus_1 = (i4_mv_y - 1);
3241
3242
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3243
                        ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel - 2, check_for_duplicate);
3244
3245
                    if(!check_for_duplicate)
3246
                    {
3247
                        /* search node mv is stored in qpel units */
3248
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3249
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3250
                        i4_frac_x = mvx_minus_1 & 1;
3251
                        i4_frac_y = mvy_minus_1 & 1;
3252
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3253
                        s_err_prms.pu1_ref =
3254
                            pu1_ref + (mvx_minus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3255
3256
                        pf_err_compute(&s_err_prms);
3257
                        /* Update the mv's with the current candt motion vectors */
3258
                        s_result_prms.i2_mv_x = mvx_qpel - 2;
3259
                        s_result_prms.i2_mv_y = mvy_qpel - 2;
3260
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3261
3262
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3263
3264
                        if(i4_tot_cost < i4_min_cost)
3265
                        {
3266
                            i4_min_cost = i4_tot_cost;
3267
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3268
                            e_min_id = PT_TL;
3269
                            pu1_final_out = s_err_prms.pu1_ref;
3270
                        }
3271
                    }
3272
                }
3273
                /* TR pt */
3274
                if(i4_grid_mask & BIT_EN(PT_TR))
3275
                {
3276
                    S32 mvx_plus_1 = (i4_mv_x + 1);
3277
                    S32 mvy_minus_1 = (i4_mv_y - 1);
3278
3279
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3280
                        ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel - 2, check_for_duplicate);
3281
3282
                    if(!check_for_duplicate)
3283
                    {
3284
                        /* search node mv is stored in qpel units */
3285
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3286
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3287
                        i4_frac_x = mvx_plus_1 & 1;
3288
                        i4_frac_y = mvy_minus_1 & 1;
3289
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3290
                        s_err_prms.pu1_ref =
3291
                            pu1_ref + (mvx_plus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3292
3293
                        pf_err_compute(&s_err_prms);
3294
                        /* Update the mv's with the current candt motion vectors */
3295
                        s_result_prms.i2_mv_x = mvx_qpel + 2;
3296
                        s_result_prms.i2_mv_y = mvy_qpel - 2;
3297
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3298
3299
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3300
3301
                        if(i4_tot_cost < i4_min_cost)
3302
                        {
3303
                            i4_min_cost = i4_tot_cost;
3304
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3305
                            e_min_id = PT_TR;
3306
                            pu1_final_out = s_err_prms.pu1_ref;
3307
                        }
3308
                    }
3309
                }
3310
                /* BL pt */
3311
                if(i4_grid_mask & BIT_EN(PT_BL))
3312
                {
3313
                    S32 mvx_minus_1 = (i4_mv_x - 1);
3314
                    S32 mvy_plus_1 = (i4_mv_y + 1);
3315
3316
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3317
                        ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel + 2, check_for_duplicate);
3318
3319
                    if(!check_for_duplicate)
3320
                    {
3321
                        /* search node mv is stored in qpel units */
3322
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3323
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3324
                        i4_frac_x = mvx_minus_1 & 1;
3325
                        i4_frac_y = mvy_plus_1 & 1;
3326
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3327
                        s_err_prms.pu1_ref =
3328
                            pu1_ref + (mvx_minus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3329
3330
                        pf_err_compute(&s_err_prms);
3331
                        /* Update the mv's with the current candt motion vectors */
3332
                        s_result_prms.i2_mv_x = mvx_qpel - 2;
3333
                        s_result_prms.i2_mv_y = mvy_qpel + 2;
3334
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3335
3336
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3337
3338
                        if(i4_tot_cost < i4_min_cost)
3339
                        {
3340
                            i4_min_cost = i4_tot_cost;
3341
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3342
                            e_min_id = PT_BL;
3343
                            pu1_final_out = s_err_prms.pu1_ref;
3344
                        }
3345
                    }
3346
                }
3347
                /* BR pt */
3348
                if(i4_grid_mask & BIT_EN(PT_BR))
3349
                {
3350
                    S32 mvx_plus_1 = (i4_mv_x + 1);
3351
                    S32 mvy_plus_1 = (i4_mv_y + 1);
3352
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3353
                        ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel + 2, check_for_duplicate);
3354
3355
                    if(!check_for_duplicate)
3356
                    {
3357
                        /* search node mv is stored in qpel units */
3358
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3359
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3360
                        i4_frac_x = mvx_plus_1 & 1;
3361
                        i4_frac_y = mvy_plus_1 & 1;
3362
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3363
                        s_err_prms.pu1_ref =
3364
                            pu1_ref + (mvx_plus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3365
3366
                        pf_err_compute(&s_err_prms);
3367
                        /* Update the mv's with the current candt motion vectors */
3368
                        s_result_prms.i2_mv_x = mvx_qpel + 2;
3369
                        s_result_prms.i2_mv_y = mvy_qpel + 2;
3370
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3371
3372
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3373
3374
                        if(i4_tot_cost < i4_min_cost)
3375
                        {
3376
                            i4_min_cost = i4_tot_cost;
3377
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3378
                            e_min_id = PT_BR;
3379
                            pu1_final_out = s_err_prms.pu1_ref;
3380
                        }
3381
                    }
3382
                }
3383
                if(e_min_id == PT_C)
3384
                {
3385
                    break;
3386
                }
3387
            }
3388
            else
3389
            {
3390
                break;
3391
            }
3392
        }
3393
3394
        /*********************************************************************/
3395
        /* Depending on the best result location, we may be able to skip     */
3396
        /* atleast two pts, centre pt and one more pt. E.g. if right pt is   */
3397
        /* the best result, the next iteration need not do centre, left pts  */
3398
        /*********************************************************************/
3399
        if(i4_i)
3400
        {
3401
            i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3402
        }
3403
        else
3404
        {
3405
            i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3406
        }
3407
        i4_mv_x += gai1_grid_id_to_x[e_min_id];
3408
        i4_mv_y += gai1_grid_id_to_y[e_min_id];
3409
        ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3410
        ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3411
        i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3412
    }
3413
3414
    /* Convert to QPEL units */
3415
    i4_mv_x <<= 1;
3416
    i4_mv_y <<= 1;
3417
3418
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3419
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3420
3421
    /* Early exit if this partition is visiting same hpel mv again */
3422
    /* Assumption : Checking for early exit in best result of partition */
3423
    if((ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x ==
3424
        ps_search_node->s_mv.i2_mvx) &&
3425
       (ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y ==
3426
        ps_search_node->s_mv.i2_mvy))
3427
    {
3428
        return (ps_search_results->aps_part_results[search_idx][part_id][0].i4_tot_cost);
3429
    }
3430
    else
3431
    {
3432
        /* Store the best hpel mv for future early exit checks */
3433
        ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x =
3434
            (S16)i4_mv_x;
3435
        ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y =
3436
            (S16)i4_mv_y;
3437
    }
3438
3439
    /* Early exit if this partition is visiting same hpel mv again */
3440
    /* Assumption : Checking for early exit in second best result of partition */
3441
    if((ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x ==
3442
        ps_search_node->s_mv.i2_mvx) &&
3443
       (ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y ==
3444
        ps_search_node->s_mv.i2_mvy))
3445
    {
3446
        return (ps_search_results->aps_part_results[search_idx][part_id][1].i4_tot_cost);
3447
    }
3448
    else
3449
    {
3450
        /* Store the best hpel mv for future early exit checks */
3451
        ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x =
3452
            (S16)i4_mv_x;
3453
        ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y =
3454
            (S16)i4_mv_y;
3455
    }
3456
3457
    /* Exact interpolation or averaging chosen here */
3458
    pf_qpel_interp = ps_prms->pf_qpel_interp;
3459
3460
    /* Next QPEL ME */
3461
    /* In this case, we have option of doing exact QPEL interpolation or avg */
3462
    /*************************************************************************/
3463
    /*        x                                                              */
3464
    /*    A b C d                                                            */
3465
    /*    e f g h                                                            */
3466
    /*    I j K l                                                            */
3467
    /*    m n o p                                                            */
3468
    /*    Q r S t                                                            */
3469
    /*                                                                       */
3470
    /*    Approximate QPEL logic                                             */
3471
    /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
3472
    /*    for any given pt, we can get all the information required about    */
3473
    /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
3474
    /*     surrounding pts info:                                             */
3475
    /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
3476
    /*           buffer 2: hxfy, offsets for both are 0, 0                   */
3477
    /*    similarly for other pts the info can be gotten                     */
3478
    /*************************************************************************/
3479
    i4_grid_mask = GRID_ALL_PTS_VALID ^ (BIT_EN(PT_C));
3480
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3481
3482
    /*************************************************************************/
3483
    /* One time preparation of non changing interpolation params. These      */
3484
    /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
3485
    /* working memory (not used though in case of averaging).                */
3486
    /*************************************************************************/
3487
    s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
3488
    s_interp_prms.i4_ref_stride = i4_ref_stride;
3489
    s_interp_prms.i4_blk_wd = i4_blk_wd;
3490
    s_interp_prms.i4_blk_ht = i4_blk_ht;
3491
3492
    i4_final_out_stride = i4_ref_stride;
3493
3494
    {
3495
        U08 *pu1_mem;
3496
        /*********************************************************************/
3497
        /* Allocation of working memory for interpolated buffers. We maintain*/
3498
        /* an intermediate working buffer, and 2 ping pong interpolated out  */
3499
        /* buffers, purpose of ping pong explained later below               */
3500
        /*********************************************************************/
3501
        pu1_mem = ps_prms->pu1_wkg_mem;
3502
        s_interp_prms.pu1_wkg_mem = pu1_mem;
3503
3504
        //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
3505
        s_interp_prms.apu1_interp_out[0] = pu1_mem;
3506
3507
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3508
        s_interp_prms.apu1_interp_out[1] = pu1_mem;
3509
3510
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3511
        s_interp_prms.apu1_interp_out[2] = pu1_mem;
3512
3513
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3514
        s_interp_prms.apu1_interp_out[3] = pu1_mem;
3515
3516
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3517
        s_interp_prms.apu1_interp_out[4] = pu1_mem;
3518
3519
        /*********************************************************************/
3520
        /* Stride of interpolated output is just a function of blk width of  */
3521
        /* this partition and hence remains constant for this partition      */
3522
        /*********************************************************************/
3523
        s_interp_prms.i4_out_stride = (i4_blk_wd);
3524
    }
3525
3526
    {
3527
        UWORD8 *apu1_final[4];
3528
        WORD32 ai4_ref_stride[4];
3529
        /*************************************************************************/
3530
        /* Ping pong design for interpolated buffers. We use a min id, which     */
3531
        /* tracks the id of the ppu1_interp_out that stores the best result.     */
3532
        /* When new interp to be done, it uses 1 - best result id to do the interp*/
3533
        /* min id is toggled when any new result becomes the best result.        */
3534
        /*************************************************************************/
3535
3536
        for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
3537
        {
3538
            e_min_id = PT_C;
3539
3540
            hme_qpel_interp_comprehensive(
3541
                &s_interp_prms, apu1_final, ai4_ref_stride, i4_mv_x, i4_mv_y, i4_grid_mask);
3542
3543
            mvx_qpel = i4_mv_x;
3544
            mvy_qpel = i4_mv_y;
3545
3546
            if(i4_grid_mask & BIT_EN(PT_L))
3547
            {
3548
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3549
                    ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 0, check_for_duplicate);
3550
3551
                if(!check_for_duplicate)
3552
                {
3553
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3554
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3555
3556
                    s_err_prms.pu1_ref = apu1_final[0];
3557
                    s_err_prms.i4_ref_stride = ai4_ref_stride[0];
3558
3559
                    pf_err_compute(&s_err_prms);
3560
                    /* Update the mv's with the current candt motion vectors */
3561
                    s_result_prms.i2_mv_x = mvx_qpel - 1;
3562
                    s_result_prms.i2_mv_y = mvy_qpel;
3563
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3564
3565
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3566
                    if(i4_tot_cost < i4_min_cost)
3567
                    {
3568
                        e_min_id = PT_L;
3569
                        i4_min_cost = i4_tot_cost;
3570
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3571
                    }
3572
                }
3573
            }
3574
            if(i4_grid_mask & BIT_EN(PT_T))
3575
            {
3576
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3577
                    ps_dedup_enabler, 1, mvx_qpel - 0, mvy_qpel - 1, check_for_duplicate);
3578
3579
                if(!check_for_duplicate)
3580
                {
3581
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3582
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3583
3584
                    s_err_prms.pu1_ref = apu1_final[1];
3585
                    s_err_prms.i4_ref_stride = ai4_ref_stride[1];
3586
3587
                    pf_err_compute(&s_err_prms);
3588
                    /* Update the mv's with the current candt motion vectors */
3589
                    s_result_prms.i2_mv_x = mvx_qpel;
3590
                    s_result_prms.i2_mv_y = mvy_qpel - 1;
3591
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3592
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3593
                    if(i4_tot_cost < i4_min_cost)
3594
                    {
3595
                        e_min_id = PT_T;
3596
                        i4_min_cost = i4_tot_cost;
3597
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3598
                    }
3599
                }
3600
            }
3601
            if(i4_grid_mask & BIT_EN(PT_R))
3602
            {
3603
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3604
                    ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
3605
3606
                if(!check_for_duplicate)
3607
                {
3608
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3609
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3610
3611
                    s_err_prms.pu1_ref = apu1_final[2];
3612
                    s_err_prms.i4_ref_stride = ai4_ref_stride[2];
3613
3614
                    pf_err_compute(&s_err_prms);
3615
                    /* Update the mv's with the current candt motion vectors */
3616
                    s_result_prms.i2_mv_x = mvx_qpel + 1;
3617
                    s_result_prms.i2_mv_y = mvy_qpel;
3618
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3619
3620
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3621
                    if(i4_tot_cost < i4_min_cost)
3622
                    {
3623
                        e_min_id = PT_R;
3624
                        i4_min_cost = i4_tot_cost;
3625
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3626
                    }
3627
                }
3628
            }
3629
            /* i4_mv_x and i4_mv_y will always be the centre pt */
3630
            /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
3631
            if(i4_grid_mask & BIT_EN(PT_B))
3632
            {
3633
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3634
                    ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
3635
3636
                if(!check_for_duplicate)
3637
                {
3638
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3639
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3640
3641
                    s_err_prms.pu1_ref = apu1_final[3];
3642
                    s_err_prms.i4_ref_stride = ai4_ref_stride[3];
3643
3644
                    pf_err_compute(&s_err_prms);
3645
                    /* Update the mv's with the current candt motion vectors */
3646
                    s_result_prms.i2_mv_x = mvx_qpel;
3647
                    s_result_prms.i2_mv_y = mvy_qpel + 1;
3648
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3649
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3650
                    if(i4_tot_cost < i4_min_cost)
3651
                    {
3652
                        e_min_id = PT_B;
3653
                        i4_min_cost = i4_tot_cost;
3654
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3655
                    }
3656
                }
3657
            }
3658
3659
            if(e_min_id == PT_C)
3660
            {
3661
                if(!i4_i)
3662
                {
3663
                    S32 i4_interp_buf_id = 0;
3664
3665
                    if(i4_grid_mask & BIT_EN(PT_TL))
3666
                    {
3667
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3668
                            ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 1, check_for_duplicate);
3669
3670
                        if(!check_for_duplicate)
3671
                        {
3672
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3673
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3674
3675
                            /* Carry out the interpolation */
3676
                            pf_qpel_interp(
3677
                                &s_interp_prms, i4_mv_x - 1, i4_mv_y - 1, i4_interp_buf_id);
3678
3679
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3680
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3681
3682
                            pf_err_compute(&s_err_prms);
3683
                            /* Update the mv's with the current candt motion vectors */
3684
                            s_result_prms.i2_mv_x = mvx_qpel - 1;
3685
                            s_result_prms.i2_mv_y = mvy_qpel - 1;
3686
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3687
3688
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3689
3690
                            if(i4_tot_cost < i4_min_cost)
3691
                            {
3692
                                e_min_id = PT_TL;
3693
                                i4_min_cost = i4_tot_cost;
3694
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3695
                            }
3696
                        }
3697
                    }
3698
                    if(i4_grid_mask & BIT_EN(PT_TR))
3699
                    {
3700
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3701
                            ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel - 1, check_for_duplicate);
3702
3703
                        if(!check_for_duplicate)
3704
                        {
3705
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3706
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3707
3708
                            /* Carry out the interpolation */
3709
                            pf_qpel_interp(
3710
                                &s_interp_prms, i4_mv_x + 1, i4_mv_y - 1, i4_interp_buf_id);
3711
3712
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3713
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3714
3715
                            pf_err_compute(&s_err_prms);
3716
                            /* Update the mv's with the current candt motion vectors */
3717
                            s_result_prms.i2_mv_x = mvx_qpel + 1;
3718
                            s_result_prms.i2_mv_y = mvy_qpel - 1;
3719
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3720
3721
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3722
3723
                            if(i4_tot_cost < i4_min_cost)
3724
                            {
3725
                                e_min_id = PT_TR;
3726
                                i4_min_cost = i4_tot_cost;
3727
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3728
                            }
3729
                        }
3730
                    }
3731
                    if(i4_grid_mask & BIT_EN(PT_BL))
3732
                    {
3733
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3734
                            ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel + 1, check_for_duplicate);
3735
3736
                        if(!check_for_duplicate)
3737
                        {
3738
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3739
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3740
3741
                            /* Carry out the interpolation */
3742
                            pf_qpel_interp(
3743
                                &s_interp_prms, i4_mv_x - 1, i4_mv_y + 1, i4_interp_buf_id);
3744
3745
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3746
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3747
3748
                            pf_err_compute(&s_err_prms);
3749
                            /* Update the mv's with the current candt motion vectors */
3750
                            s_result_prms.i2_mv_x = mvx_qpel - 1;
3751
                            s_result_prms.i2_mv_y = mvy_qpel + 1;
3752
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3753
3754
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3755
3756
                            if(i4_tot_cost < i4_min_cost)
3757
                            {
3758
                                e_min_id = PT_BL;
3759
                                i4_min_cost = i4_tot_cost;
3760
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3761
                            }
3762
                        }
3763
                    }
3764
                    /* i4_mv_x and i4_mv_y will always be the centre pt */
3765
                    /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
3766
                    if(i4_grid_mask & BIT_EN(PT_BR))
3767
                    {
3768
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3769
                            ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel + 1, check_for_duplicate);
3770
3771
                        if(!check_for_duplicate)
3772
                        {
3773
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3774
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3775
3776
                            /* Carry out the interpolation */
3777
                            pf_qpel_interp(
3778
                                &s_interp_prms, i4_mv_x + 1, i4_mv_y + 1, i4_interp_buf_id);
3779
3780
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3781
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3782
3783
                            pf_err_compute(&s_err_prms);
3784
                            /* Update the mv's with the current candt motion vectors */
3785
                            s_result_prms.i2_mv_x = mvx_qpel + 1;
3786
                            s_result_prms.i2_mv_y = mvy_qpel + 1;
3787
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3788
3789
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3790
3791
                            if(i4_tot_cost < i4_min_cost)
3792
                            {
3793
                                e_min_id = PT_BR;
3794
                                i4_min_cost = i4_tot_cost;
3795
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3796
                            }
3797
                        }
3798
                    }
3799
                    if(e_min_id == PT_C)
3800
                    {
3801
                        break;
3802
                    }
3803
                }
3804
                else
3805
                {
3806
                    break;
3807
                }
3808
            }
3809
3810
            if(i4_i)
3811
            {
3812
                i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3813
            }
3814
            else
3815
            {
3816
                i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3817
            }
3818
            i4_mv_x += gai1_grid_id_to_x[e_min_id];
3819
            i4_mv_y += gai1_grid_id_to_y[e_min_id];
3820
            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3821
            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3822
            i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3823
        }
3824
    }
3825
3826
    /* update modified motion vectors and cost at end of subpel */
3827
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3828
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3829
    ps_search_node->i4_tot_cost = i4_min_cost;
3830
    ps_search_node->i4_sad = i4_min_sad;
3831
3832
    /********************************************************************************/
3833
    /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
3834
    /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
3835
    /********************************************************************************/
3836
    //ps_pred_ctxt->lambda >>= 1;
3837
3838
    return (i4_min_cost);
3839
}
3840
#endif
3841
3842
static void hme_subpel_refine_struct_to_search_results_struct_converter(
3843
    subpel_refine_ctxt_t *ps_subpel_refine_ctxt,
3844
    search_results_t *ps_search_results,
3845
    U08 u1_pred_dir,
3846
    ME_QUALITY_PRESETS_T e_quality_preset)
3847
170k
{
3848
170k
    U08 i;
3849
3850
170k
    U08 u1_num_results_per_part = ps_search_results->u1_num_results_per_part;
3851
3852
1.56M
    for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
3853
1.39M
    {
3854
1.39M
        S32 index;
3855
1.39M
        S32 i4_sad;
3856
3857
1.39M
        S32 part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
3858
3859
1.39M
        search_node_t *ps_best_node = ps_search_results->aps_part_results[u1_pred_dir][part_id];
3860
3861
1.39M
        if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
3862
1.21M
        {
3863
1.21M
            index = part_id;
3864
1.21M
        }
3865
177k
        else
3866
177k
        {
3867
177k
            index = i;
3868
177k
        }
3869
3870
1.39M
        if(!ps_best_node->u1_subpel_done)
3871
733k
        {
3872
733k
            i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3873
733k
                     ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3874
733k
            ps_best_node[0].i4_sdi = 0;
3875
733k
            ASSERT((e_quality_preset == ME_PRISTINE_QUALITY) ? (ps_best_node[0].i4_sdi >= 0) : 1);
3876
733k
            ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3877
3878
733k
            if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3879
708
            {
3880
708
                i4_sad = MAX_SIGNED_16BIT_VAL;
3881
708
            }
3882
3883
733k
            ps_best_node[0].i4_sad = i4_sad;
3884
733k
            ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3885
733k
            ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3886
733k
            ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3887
733k
            ps_best_node[0].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3888
733k
            ps_best_node->u1_subpel_done = 1;
3889
3890
733k
            if(2 == u1_num_results_per_part)
3891
0
            {
3892
0
                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3893
0
                         ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3894
0
                ps_best_node[1].i4_sdi = 0;
3895
0
                ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3896
3897
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3898
0
                {
3899
0
                    i4_sad = MAX_SIGNED_16BIT_VAL;
3900
0
                }
3901
3902
0
                ps_best_node[1].i4_sad = i4_sad;
3903
0
                ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3904
0
                ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3905
0
                ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3906
0
                ps_best_node[1].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3907
0
                ps_best_node[1].u1_subpel_done = 1;
3908
0
            }
3909
733k
        }
3910
656k
        else if(
3911
656k
            (2 == u1_num_results_per_part) &&
3912
0
            (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[1].i4_tot_cost))
3913
0
        {
3914
0
            if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] < ps_best_node[0].i4_tot_cost)
3915
0
            {
3916
0
                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3917
0
                         ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3918
0
                ps_best_node[0].i4_sdi = 0;
3919
0
                ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3920
3921
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3922
0
                {
3923
0
                    i4_sad = MAX_SIGNED_16BIT_VAL;
3924
0
                }
3925
3926
0
                ps_best_node[0].i4_sad = i4_sad;
3927
0
                ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3928
0
                ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3929
0
                ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3930
0
                ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3931
3932
0
                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3933
0
                         ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3934
0
                ps_best_node[1].i4_sdi = 0;
3935
0
                ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3936
3937
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3938
0
                {
3939
0
                    i4_sad = MAX_SIGNED_16BIT_VAL;
3940
0
                }
3941
3942
0
                ps_best_node[1].i4_sad = i4_sad;
3943
0
                ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3944
0
                ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3945
0
                ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3946
0
                ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3947
0
            }
3948
0
            else if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] > ps_best_node[0].i4_tot_cost)
3949
0
            {
3950
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >= ps_best_node[0].i4_tot_cost)
3951
0
                {
3952
0
                    i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3953
0
                             ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3954
0
                    ps_best_node[1].i4_sdi = 0;
3955
0
                    ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3956
3957
0
                    if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3958
0
                    {
3959
0
                        i4_sad = MAX_SIGNED_16BIT_VAL;
3960
0
                    }
3961
3962
0
                    ps_best_node[1].i4_sad = i4_sad;
3963
0
                    ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3964
0
                    ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3965
0
                    ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3966
0
                    ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3967
0
                }
3968
0
                else if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost)
3969
0
                {
3970
0
                    memmove(&ps_best_node[1], &ps_best_node[0], sizeof(search_node_t));
3971
3972
0
                    i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3973
0
                             ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3974
0
                    ps_best_node[0].i4_sdi = 0;
3975
0
                    ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3976
3977
0
                    if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3978
0
                    {
3979
0
                        i4_sad = MAX_SIGNED_16BIT_VAL;
3980
0
                    }
3981
3982
0
                    ps_best_node[0].i4_sad = i4_sad;
3983
0
                    ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3984
0
                    ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3985
0
                    ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3986
0
                    ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3987
0
                }
3988
0
            }
3989
0
        }
3990
656k
        else if(
3991
656k
            (1 == u1_num_results_per_part) &&
3992
656k
            (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost))
3993
160k
        {
3994
160k
            i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3995
160k
                     ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3996
160k
            ps_best_node[0].i4_sdi = 0;
3997
160k
            ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3998
3999
160k
            if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
4000
0
            {
4001
0
                i4_sad = MAX_SIGNED_16BIT_VAL;
4002
0
            }
4003
4004
160k
            ps_best_node[0].i4_sad = i4_sad;
4005
160k
            ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4006
160k
            ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4007
160k
            ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4008
160k
            ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
4009
160k
        }
4010
1.39M
    }
4011
170k
}
4012
4013
/**
4014
********************************************************************************
4015
*  @fn     void hme_subpel_refine_cu_hs
4016
*
4017
*  @brief  Evaluates the best subpel mvs for active partitions of an MB in L0
4018
*          layer for the high speed preset. Recursive hadamard SATD / SAD
4019
*          and mv cost is used for 2NxN and NxN partitions with active partition
4020
*          update
4021
*
4022
*  @param[in]  ps_prms: subpel prms input to this function
4023
*
4024
*  @param[in]  ps_curr_layer: points to the current layer ctxt
4025
*
4026
*  @param[out] ps_search_results: points to the search results that get updated
4027
*              with best results
4028
*
4029
*  @param[in]  search_idx:  ref id of the frame for which results get updated
4030
*
4031
*  @param[in]  ps_wt_inp_prms:  current frame input params
4032
*
4033
*  @return     None
4034
********************************************************************************
4035
*/
4036
void hme_subpel_refine_cu_hs(
4037
    hme_subpel_prms_t *ps_prms,
4038
    layer_ctxt_t *ps_curr_layer,
4039
    search_results_t *ps_search_results,
4040
    S32 search_idx,
4041
    wgt_pred_ctxt_t *ps_wt_inp_prms,
4042
    WORD32 blk_8x8_mask,
4043
    me_func_selector_t *ps_func_selector,
4044
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
4045
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
4046
170k
{
4047
    /* Unique search node list for 2nx2n and nxn partitions */
4048
170k
    search_node_t as_nodes_2nx2n[MAX_RESULTS_PER_PART * 5];
4049
170k
    subpel_dedup_enabler_t as_subpel_dedup_enabler[MAX_NUM_REF];
4050
170k
    search_node_t *ps_search_node;
4051
4052
170k
    S32 i, i4_part_mask, j;
4053
170k
    S32 i4_sad_grid;
4054
170k
    S32 max_subpel_cand;
4055
170k
    WORD32 index;
4056
170k
    S32 num_unique_nodes_2nx2n;
4057
170k
    S32 part_id;
4058
170k
    S32 x_off, y_off;
4059
170k
    S32 i4_inp_off;
4060
4061
170k
    CU_SIZE_T e_cu_size;
4062
170k
    BLK_SIZE_T e_blk_size;
4063
4064
170k
    subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
4065
4066
170k
    S32 i4_use_satd = ps_prms->i4_use_satd;
4067
170k
    S32 i4_num_act_refs = ps_prms->i4_num_act_ref_l0 + ps_prms->i4_num_act_ref_l1;
4068
4069
170k
    ASSERT(ps_search_results->u1_num_results_per_part <= MAX_RESULTS_PER_PART);
4070
4071
170k
    if(!DISABLE_SUBPEL_REFINEMENT_WHEN_SRC_IS_NOISY || !ps_prms->u1_is_cu_noisy)
4072
170k
    {
4073
170k
        e_cu_size = ps_search_results->e_cu_size;
4074
170k
        i4_part_mask = ps_search_results->i4_part_mask;
4075
4076
170k
        ps_prms->i4_inp_type = sizeof(U08);
4077
4078
170k
        num_unique_nodes_2nx2n = 0;
4079
4080
557k
        for(i = 0; i < i4_num_act_refs; i++)
4081
386k
        {
4082
386k
            as_subpel_dedup_enabler[i].u1_ref_idx = MAX_NUM_REF;
4083
386k
        }
4084
4085
        /************************************************************************/
4086
        /*                                                                      */
4087
        /*  Initialize SATD cost for each valid partition id.one time before    */
4088
        /*  doing full pel time. This is because of the following reasons:      */
4089
        /*   1. Full pel cost was done in  SAD while subpel is in SATD mode     */
4090
        /*   2. Partitions like AMP, Nx2N and 2NxN are refined on the fly while */
4091
        /*      doing Diamond search for 2Nx2N and NxN. This partitions are     */
4092
        /*      not explicitly refine in high speed mode                        */
4093
        /*                                                                      */
4094
        /************************************************************************/
4095
1.56M
        for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4096
1.39M
        {
4097
1.39M
            S32 enable_subpel = 0;
4098
1.39M
            S32 part_type;
4099
4100
            /* Derive the x and y offsets of this part id */
4101
1.39M
            part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4102
1.39M
            if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4103
1.21M
            {
4104
1.21M
                index = part_id;
4105
1.21M
            }
4106
177k
            else
4107
177k
            {
4108
177k
                index = i;
4109
177k
            }
4110
4111
1.39M
            part_type = ge_part_id_to_part_type[part_id];
4112
1.39M
            x_off = gas_part_attr_in_cu[part_id].u1_x_start << e_cu_size;
4113
1.39M
            y_off = gas_part_attr_in_cu[part_id].u1_y_start << e_cu_size;
4114
1.39M
            x_off += ps_search_results->u1_x_off;
4115
1.39M
            y_off += ps_search_results->u1_y_off;
4116
1.39M
            i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4117
1.39M
            e_blk_size = ge_part_id_to_blk_size[e_cu_size][part_id];
4118
4119
1.39M
            x_off += ps_prms->i4_ctb_x_off;
4120
1.39M
            y_off += ps_prms->i4_ctb_y_off;
4121
4122
1.39M
            max_subpel_cand = 0;
4123
4124
            /* Choose the minimum number of candidates to be used for Sub pel refinement */
4125
1.39M
            if(PART_ID_2Nx2N == part_type)
4126
156k
            {
4127
156k
                max_subpel_cand =
4128
156k
                    MIN(ps_prms->u1_max_subpel_candts_2Nx2N,
4129
156k
                        ps_search_results->u1_num_results_per_part);
4130
156k
            }
4131
1.23M
            else if(PRT_NxN == part_type)
4132
383k
            {
4133
383k
                max_subpel_cand = MIN(
4134
383k
                    ps_prms->u1_max_subpel_candts_NxN, ps_search_results->u1_num_results_per_part);
4135
383k
            }
4136
4137
            /* If incomplete CTB, NxN num candidates should be forced to min 1 */
4138
1.39M
            if((0 == max_subpel_cand) && (blk_8x8_mask != 15))
4139
5.23k
            {
4140
5.23k
                max_subpel_cand = 1;
4141
5.23k
            }
4142
4143
1.39M
            if((PART_ID_2Nx2N == part_type) || (PRT_NxN == part_type))
4144
539k
            {
4145
539k
                enable_subpel = 1;
4146
539k
            }
4147
4148
            /* Compute full pel SATD for each result per partition before subpel */
4149
            /* refinement starts.                                                */
4150
            /* Also prepare unique candidate list for 2Nx2N and NxN partitions   */
4151
2.78M
            for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4152
1.39M
            {
4153
1.39M
                err_prms_t s_err_prms;
4154
1.39M
                S32 i4_satd = 0;
4155
1.39M
                S32 i1_ref_idx;
4156
1.39M
                U08 *pu1_ref_base;
4157
1.39M
                S32 i4_ref_stride = ps_curr_layer->i4_rec_stride;
4158
1.39M
                S32 i4_mv_x, i4_mv_y;
4159
4160
1.39M
                ps_search_node = ps_search_results->aps_part_results[search_idx][part_id] + j;
4161
4162
1.39M
                if(ps_subpel_refine_ctxt->i2_mv_x[j][index] == INTRA_MV)
4163
0
                {
4164
0
                    ps_search_node->u1_subpel_done = 1;
4165
0
                    continue;
4166
0
                }
4167
4168
1.39M
                i1_ref_idx = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4169
1.39M
                ps_prms->pv_inp = (void *)(ps_wt_inp_prms->apu1_wt_inp[i1_ref_idx] + i4_inp_off);
4170
1.39M
                pu1_ref_base = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx];
4171
4172
1.39M
                i4_mv_x = ps_subpel_refine_ctxt->i2_mv_x[j][index];
4173
1.39M
                i4_mv_y = ps_subpel_refine_ctxt->i2_mv_y[j][index];
4174
4175
1.39M
                if(i4_use_satd)
4176
1.26M
                {
4177
1.26M
                    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
4178
1.26M
                    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
4179
1.26M
                    s_err_prms.pu1_ref = pu1_ref_base + x_off + (y_off * i4_ref_stride) + i4_mv_x +
4180
1.26M
                                         (i4_mv_y * i4_ref_stride);
4181
4182
1.26M
                    s_err_prms.i4_ref_stride = i4_ref_stride;
4183
1.26M
                    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
4184
1.26M
                    s_err_prms.i4_grid_mask = 1;
4185
1.26M
                    s_err_prms.pi4_sad_grid = &i4_sad_grid;
4186
1.26M
                    s_err_prms.i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
4187
1.26M
                    s_err_prms.i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
4188
4189
1.26M
                    s_err_prms.ps_cmn_utils_optimised_function_list =
4190
1.26M
                        ps_cmn_utils_optimised_function_list;
4191
4192
1.26M
                    compute_satd_8bit(&s_err_prms);
4193
4194
1.26M
                    i4_satd = s_err_prms.pi4_sad_grid[0];
4195
4196
1.26M
                    ps_subpel_refine_ctxt->i2_tot_cost[j][index] =
4197
1.26M
                        CLIP_S16(ps_subpel_refine_ctxt->i2_mv_cost[j][index] + i4_satd);
4198
1.26M
                    ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index] = i4_satd;
4199
1.26M
                }
4200
4201
                /* Sub-pel candidate filtration */
4202
1.39M
                if(j)
4203
0
                {
4204
0
                    S16 i2_best_sad;
4205
0
                    S32 i4_best_mvx;
4206
0
                    S32 i4_best_mvy;
4207
4208
0
                    search_node_t *ps_node =
4209
0
                        ps_search_results->aps_part_results[search_idx][part_id];
4210
4211
0
                    U08 u1_is_subpel_done = ps_node->u1_subpel_done;
4212
0
                    S16 i2_curr_sad = ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index];
4213
0
                    S32 i4_curr_mvx = i4_mv_x << 2;
4214
0
                    S32 i4_curr_mvy = i4_mv_y << 2;
4215
4216
0
                    if(u1_is_subpel_done)
4217
0
                    {
4218
0
                        i2_best_sad = ps_node->i4_sad;
4219
4220
0
                        if(ps_node->i1_ref_idx == i1_ref_idx)
4221
0
                        {
4222
0
                            i4_best_mvx = ps_node->s_mv.i2_mvx;
4223
0
                            i4_best_mvy = ps_node->s_mv.i2_mvy;
4224
0
                        }
4225
0
                        else if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4226
0
                        {
4227
0
                            i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4228
0
                            i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4229
0
                        }
4230
0
                        else
4231
0
                        {
4232
0
                            i4_best_mvx = INTRA_MV;
4233
0
                            i4_best_mvy = INTRA_MV;
4234
0
                        }
4235
0
                    }
4236
0
                    else
4237
0
                    {
4238
0
                        i2_best_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
4239
0
                                      ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4240
4241
0
                        if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4242
0
                        {
4243
0
                            i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4244
0
                            i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4245
0
                        }
4246
0
                        else
4247
0
                        {
4248
0
                            i4_best_mvx = INTRA_MV;
4249
0
                            i4_best_mvy = INTRA_MV;
4250
0
                        }
4251
0
                    }
4252
4253
0
                    i2_best_sad += (i2_best_sad >> ps_prms->u1_subpel_candt_threshold);
4254
4255
0
                    if(((ABS(i4_curr_mvx - i4_best_mvx) < 2) &&
4256
0
                        (ABS(i4_curr_mvy - i4_best_mvy) < 2)) ||
4257
0
                       (i2_curr_sad > i2_best_sad))
4258
0
                    {
4259
0
                        enable_subpel = 0;
4260
0
                    }
4261
0
                }
4262
4263
1.39M
                ps_search_node->u1_part_id = part_id;
4264
4265
                /* Convert mvs in part results from FPEL to QPEL units */
4266
1.39M
                ps_subpel_refine_ctxt->i2_mv_x[j][index] <<= 2;
4267
1.39M
                ps_subpel_refine_ctxt->i2_mv_y[j][index] <<= 2;
4268
4269
                /* If the candidate number is more than the number of candts
4270
                set initally, do not add those candts for refinement */
4271
1.39M
                if(j >= max_subpel_cand)
4272
868k
                {
4273
868k
                    enable_subpel = 0;
4274
868k
                }
4275
4276
1.39M
                if(enable_subpel)
4277
521k
                {
4278
521k
                    if(num_unique_nodes_2nx2n == 0)
4279
170k
                    {
4280
170k
                        S32 i4_index = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4281
4282
170k
                        as_subpel_dedup_enabler[i4_index].i2_mv_x =
4283
170k
                            ps_subpel_refine_ctxt->i2_mv_x[j][index];
4284
170k
                        as_subpel_dedup_enabler[i4_index].i2_mv_y =
4285
170k
                            ps_subpel_refine_ctxt->i2_mv_y[j][index];
4286
170k
                        as_subpel_dedup_enabler[i4_index].u1_ref_idx =
4287
170k
                            (U08)ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4288
170k
                        memset(
4289
170k
                            as_subpel_dedup_enabler[i4_index].au4_node_map,
4290
170k
                            0,
4291
170k
                            sizeof(U32) * 2 * MAP_X_MAX);
4292
170k
                    }
4293
521k
                    INSERT_NEW_NODE_NOMAP_ALTERNATE(
4294
521k
                        as_nodes_2nx2n, num_unique_nodes_2nx2n, ps_subpel_refine_ctxt, j, i);
4295
521k
                }
4296
1.39M
            }
4297
4298
            /*********************************************************************************************/
4299
            /* If sad_1 < sad_2, then satd_1 need not be lesser than satd_2. Therefore, after conversion */
4300
            /* to satd, tot_cost_1 may not be lesser than tot_cost_2. So we need to sort the search nodes*/
4301
            /* for each partition again, based on the new costs                                          */
4302
            /*********************************************************************************************/
4303
            /*********************************************************************************************/
4304
            /* Because right now, we store only the two best candidates for each partition, the sort will*/
4305
            /* converge to a simple swap.                                                                */
4306
            /* ASSUMPTION : We store only two best results per partition                                 */
4307
            /*********************************************************************************************/
4308
1.39M
            if(ps_search_results->u1_num_results_per_part == 2)
4309
0
            {
4310
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >
4311
0
                   ps_subpel_refine_ctxt->i2_tot_cost[1][index])
4312
0
                {
4313
0
                    SWAP(
4314
0
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index],
4315
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
4316
4317
0
                    SWAP(
4318
0
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index],
4319
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index]);
4320
4321
0
                    SWAP(
4322
0
                        ps_subpel_refine_ctxt->i2_mv_x[0][index],
4323
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index]);
4324
4325
0
                    SWAP(
4326
0
                        ps_subpel_refine_ctxt->i2_mv_y[0][index],
4327
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index]);
4328
4329
0
                    SWAP(
4330
0
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index],
4331
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index]);
4332
4333
0
                    SWAP(
4334
0
                        ps_subpel_refine_ctxt->ai2_fullpel_satd[0][index],
4335
0
                        ps_subpel_refine_ctxt->ai2_fullpel_satd[1][index]);
4336
0
                }
4337
0
            }
4338
1.39M
        }
4339
4340
170k
        if(blk_8x8_mask == 0xf)
4341
156k
        {
4342
156k
            num_unique_nodes_2nx2n =
4343
156k
                MIN(num_unique_nodes_2nx2n, ps_prms->u1_max_num_subpel_refine_centers);
4344
156k
        }
4345
170k
        {
4346
170k
            x_off = gas_part_attr_in_cu[0].u1_x_start << e_cu_size;
4347
170k
            y_off = gas_part_attr_in_cu[0].u1_y_start << e_cu_size;
4348
170k
            x_off += ps_search_results->u1_x_off;
4349
170k
            y_off += ps_search_results->u1_y_off;
4350
170k
            i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4351
170k
            e_blk_size = ge_part_id_to_blk_size[e_cu_size][0];
4352
4353
445k
            for(j = 0; j < num_unique_nodes_2nx2n; j++)
4354
275k
            {
4355
275k
                S32 pred_lx;
4356
275k
                ps_search_node = &as_nodes_2nx2n[j];
4357
4358
275k
                if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
4359
0
                {
4360
0
                    continue;
4361
0
                }
4362
4363
275k
                {
4364
275k
                    S08 i1_ref_idx = ps_search_node->i1_ref_idx;
4365
275k
                    subpel_dedup_enabler_t *ps_dedup_enabler =
4366
275k
                        &(as_subpel_dedup_enabler[i1_ref_idx]);
4367
4368
275k
                    if(ps_dedup_enabler->u1_ref_idx == MAX_NUM_REF)
4369
1.78k
                    {
4370
1.78k
                        as_subpel_dedup_enabler[i1_ref_idx].i2_mv_x = ps_search_node->s_mv.i2_mvx;
4371
1.78k
                        as_subpel_dedup_enabler[i1_ref_idx].i2_mv_y = ps_search_node->s_mv.i2_mvy;
4372
1.78k
                        as_subpel_dedup_enabler[i1_ref_idx].u1_ref_idx = i1_ref_idx;
4373
1.78k
                        memset(
4374
1.78k
                            as_subpel_dedup_enabler[i1_ref_idx].au4_node_map,
4375
1.78k
                            0,
4376
1.78k
                            sizeof(U32) * 2 * MAP_X_MAX);
4377
1.78k
                    }
4378
275k
                }
4379
4380
275k
                pred_lx = search_idx;
4381
275k
                ps_prms->pv_inp =
4382
275k
                    (void *)(ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off);
4383
4384
275k
                hme_subpel_refine_search_node_high_speed(
4385
275k
                    ps_search_node,
4386
275k
                    ps_prms,
4387
275k
                    ps_curr_layer,
4388
275k
                    e_blk_size,
4389
275k
                    x_off + ps_prms->i4_ctb_x_off,
4390
275k
                    y_off + ps_prms->i4_ctb_y_off,
4391
275k
                    ps_search_results,
4392
275k
                    pred_lx,
4393
275k
                    i4_part_mask,
4394
275k
                    &ps_subpel_refine_ctxt->ai4_part_id[0],
4395
275k
                    search_idx,
4396
275k
                    &(as_subpel_dedup_enabler[ps_search_node->i1_ref_idx]),
4397
275k
                    ps_func_selector,
4398
275k
                    ps_me_optimised_function_list);
4399
275k
            }
4400
170k
        }
4401
170k
    }
4402
0
    else
4403
0
    {
4404
0
        for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4405
0
        {
4406
0
            S32 i4_index;
4407
4408
0
            S32 i4_part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4409
4410
0
            if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4411
0
            {
4412
0
                i4_index = i4_part_id;
4413
0
            }
4414
0
            else
4415
0
            {
4416
0
                i4_index = i;
4417
0
            }
4418
4419
0
            for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4420
0
            {
4421
0
                ps_subpel_refine_ctxt->i2_mv_x[j][i4_index] <<= 2;
4422
0
                ps_subpel_refine_ctxt->i2_mv_y[j][i4_index] <<= 2;
4423
0
            }
4424
0
        }
4425
0
    }
4426
4427
170k
    hme_subpel_refine_struct_to_search_results_struct_converter(
4428
170k
        ps_subpel_refine_ctxt, ps_search_results, search_idx, ps_prms->e_me_quality_presets);
4429
170k
}